Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

IndexEnvironment.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // IndexEnvironment
00015 //
00016 // 19 July 2004 -- tds
00017 //
00018 
00019 #ifndef INDRI_INDEXENVIRONMENT_HPP
00020 #define INDRI_INDEXENVIRONMENT_HPP
00021 
00022 #include <string>
00023 #include "indri/Parameters.hpp"
00024 #include "indri/HTMLParser.hpp"
00025 #include "indri/ConflationPattern.hpp"
00026 #include "indri/Repository.hpp"
00027 #include "indri/IndriParser.hpp"
00028 #include "indri/IndriTokenizer.hpp"
00029 #include "indri/DocumentIterator.hpp"
00030 #include "indri/AnchorTextAnnotator.hpp"
00031 #include "indri/OffsetAnnotationAnnotator.hpp"
00032 #include "indri/OffsetMetadataAnnotator.hpp"
00033 #include "indri/Transformation.hpp"
00034 #include "indri/DocumentIteratorFactory.hpp"
00035 #include "indri/ParserFactory.hpp"
00036 #include "indri/FileClassEnvironmentFactory.hpp"
00037 #include <map>
00038 namespace indri 
00039 {
00041   namespace api 
00042   {
00043     
00044     struct IndexStatus {
00045       enum action_code {
00046         FileOpen,
00047         FileSkip,
00048         FileError,
00049         FileClose,
00050         DocumentCount
00051       };
00052 
00053       virtual void operator () ( int code, const std::string& documentPath, const std::string& error, int documentsIndexed, int documentsSeen ) {
00054         status( code, documentPath, error, documentsIndexed, documentsSeen );
00055       }
00056 
00057       virtual void status( int code, const std::string& documentPath, const std::string& error, int documentsIndexed, int documentsSeen ) {};
00058     };
00059 
00066     class IndexEnvironment {
00067     private:
00068       IndexStatus* _callback;
00069       Parameters* _options;
00070 
00071       std::string _repositoryPath;
00072       indri::collection::Repository _repository;
00073       int _documents;
00074       std::string _error;
00075 
00076       std::string _offsetAnnotationsRoot;
00077       std::string _offsetMetadataRoot;
00078       std::string _anchorTextRoot;
00079       std::string _documentRoot;
00080 
00081       Parameters _parameters;
00082       indri::parse::FileClassEnvironmentFactory _fileClassFactory;
00083 
00084       indri::parse::AnchorTextAnnotator _annotator;
00085       indri::parse::OffsetAnnotationAnnotator _oa_annotator;
00086       indri::parse::OffsetMetadataAnnotator _om_annotator;
00087 
00088       std::map<std::string, indri::parse::FileClassEnvironment*> _environments;
00089 
00090       int _documentsIndexed;
00091       int _documentsSeen;
00092 
00093       void _getParsingContext( indri::parse::Parser** parser,
00094                                indri::parse::Tokenizer** tokenizer,
00095                                indri::parse::DocumentIterator** iterator,
00096                                indri::parse::Conflater** conflater,
00097                                const std::string& extension );
00098 
00099       std::vector<indri::parse::Transformation*> _createAnnotators( const std::string& fileName, 
00100                                                                     const std::string& fileClass, 
00101                                                                     indri::parse::Conflater** conflater);
00102 
00103       ParsedDocument* _applyAnnotators( std::vector<indri::parse::Transformation*>& annotators, 
00104                                         ParsedDocument* parsed ); 
00105 
00106 
00107     public:
00108       friend class QueryEnvironment;
00109 
00110       IndexEnvironment();
00111       ~IndexEnvironment();
00112 
00115       void setOffsetAnnotationsPath( const std::string& offsetAnnotationsRoot );
00116 
00119       void setOffsetMetadataPath( const std::string& offsetMetadataRoot );
00120 
00123       void setAnchorTextPath( const std::string& anchorTextRoot );
00124 
00127       void setDocumentRoot( const std::string& documentRoot );
00128 
00143       void addFileClass( const std::string& name, 
00144                          const std::string& iterator,
00145                          const std::string& parser,
00146                          const std::string& tokenizer,
00147                          const std::string& startDocTag,
00148                          const std::string& endDocTag,
00149                          const std::string& endMetadataTag,
00150                          const std::vector<std::string>& include,
00151                          const std::vector<std::string>& exclude,
00152                          const std::vector<std::string>& index,
00153                          const std::vector<std::string>& metadata, 
00154                          const std::map<indri::parse::ConflationPattern*,std::string>& conflations );
00155 
00158       indri::parse::FileClassEnvironmentFactory::Specification *getFileClassSpec( const std::string& name) {
00159         return _fileClassFactory.getFileClassSpec(name);
00160       }
00161 
00164       void addFileClass( const indri::parse::FileClassEnvironmentFactory::Specification &spec ){
00165         _fileClassFactory.addFileClass(spec);
00166       }
00167   
00173       void setIndexedFields( const std::vector<std::string>& fieldNames );
00174 
00179       void setNumericField( const std::string& fieldName, bool isNumeric,
00180                             const std::string &parserName = "");
00181 
00185       void setOrdinalField( const std::string& fieldName, bool isOrdinal);
00186 
00190       void setParentalField( const std::string& fieldName, bool isParental);
00191 
00192 
00201       void setMetadataIndexedFields( const std::vector<std::string>& forwardFieldNames, const std::vector<std::string>& backwardFieldNames );
00202 
00205       void setStopwords( const std::vector<std::string>& stopwords );
00206 
00209       void setStemmer( const std::string& stemmer );
00210 
00213       void setMemory( UINT64 memory );
00214 
00217       void setNormalization( bool flag );
00218 
00221       void setStoreDocs( bool flag );
00222 
00225       void setOffsetAnnotationIndexHint(indri::parse::OffsetAnnotationIndexHint hintType);
00226 
00230       void create( const std::string& repositoryPath, IndexStatus* callback = 0 );
00231 
00235       void open( const std::string& repositoryPath, IndexStatus* callback = 0 );
00236 
00238       void close();
00239   
00245       void addFile( const std::string& fileName );
00246 
00250       void addFile( const std::string& fileName, const std::string& fileClass );
00251 
00257       lemur::api::DOCID_T addString( const std::string& documentString, 
00258                      const std::string& fileClass, 
00259                      const std::vector<indri::parse::MetadataPair>& metadata );
00260 
00271       lemur::api::DOCID_T addString( const std::string& documentString, 
00272                      const std::string& fileClass, 
00273                      const std::vector<indri::parse::MetadataPair>& metadata, 
00274                      const std::vector<indri::parse::TagExtent *> &tags );
00275       
00278       lemur::api::DOCID_T addParsedDocument( ParsedDocument* document );
00279 
00282       void deleteDocument( lemur::api::DOCID_T documentID );
00283 
00285       int documentsIndexed();
00286 
00290       int documentsSeen();
00291 
00294       void compact();
00295 
00302       static void merge( const std::string& outputIndex, const std::vector<std::string>& inputIndexes );
00303     };
00304   }
00305 }
00306 
00307 #endif // INDRI_INDEXENVIRONMENT_HPP
00308 

Generated on Tue Jun 15 11:02:54 2010 for Lemur by doxygen 1.3.4