Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

DiskIndex.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 //
00013 // DiskIndex
00014 //
00015 // 8 December 2004 -- tds
00016 //
00017 
00018 #ifndef INDRI_DISKINDEX_HPP
00019 #define INDRI_DISKINDEX_HPP
00020 
00021 #include "indri/Index.hpp"
00022 #include "indri/File.hpp"
00023 #include "Keyfile.hpp"
00024 #include "indri/TermData.hpp"
00025 #include "indri/FieldStatistics.hpp"
00026 #include "indri/CorpusStatistics.hpp"
00027 #include "indri/DiskTermData.hpp"
00028 #include <vector>
00029 #include <string>
00030 #include "indri/BulkTree.hpp"
00031 #include "indri/SequentialReadBuffer.hpp"
00032 
00033 namespace indri {
00034   namespace index {
00035     class DiskIndex : public Index { // Index subclass whose state is held in indri::file File / BulkTreeReader members
00036     private:
00037       indri::thread::Mutex _lock; // NOTE(review): use not visible in this header; presumably what iteratorLock()/statisticsLock() hand out -- confirm
00038       // NOTE(review): presumably the index location given to open() and returned by path() -- confirm
00039       std::string _path;
00040       // Going by the names: term-string -> term lookups, split into frequent and infrequent terms (split criterion not visible here)
00041       indri::file::BulkTreeReader _frequentStringToTerm;
00042       indri::file::BulkTreeReader  _infrequentStringToTerm;
00043       // Going by the names: the same frequent/infrequent split, keyed by term ID
00044       indri::file::BulkTreeReader _frequentIdToTerm;
00045       indri::file::BulkTreeReader _infrequentIdToTerm;
00046       // NOTE(review): contents and format of this file are not visible in this header
00047       indri::file::File _frequentTermsData;
00048       // Per-document files; _documentLengths is the file _lengthsBuffer is constructed over (see constructor)
00049       indri::file::File _documentLengths;
00050       indri::file::File _documentStatistics;
00051       // NOTE(review): presumably postings, per-document term lists and field extents -- confirm against the list accessors below
00052       indri::file::File _invertedFile;
00053       indri::file::File _directFile;
00054       indri::file::File _fieldsFile;
00055       // Keep declared after _documentLengths: members are constructed in declaration order and this one is built from that file
00056       indri::file::SequentialReadBuffer _lengthsBuffer;
00057       // Field statistics and ID offsets; NOTE(review): "base" presumably means the first ID covered -- confirm
00058       std::vector<FieldStatistics> _fieldData;
00059       lemur::api::DOCID_T  _documentBase;
00060       int _infrequentTermBase;
00061       // Internal DiskTermData lookup by term ID or by term string; NOTE(review): ownership of the returned pointer is not visible here
00062       indri::index::DiskTermData* _fetchTermData( lemur::api::TERMID_T termID );
00063       indri::index::DiskTermData* _fetchTermData( const char* termString );
00064       // _readManifest takes the path of a manifest file; NOTE(review): presumably fills _corpusStatistics/_fieldData -- confirm
00065       CorpusStatistics _corpusStatistics;
00066       void _readManifest( const std::string& manifestPath );
00067 
00068     public:
00069       DiskIndex() : _lengthsBuffer(_documentLengths) {} // empty body: only hands the lengths file to the read buffer; other members are default-constructed
00070       // open() takes two path components (base, relative); NOTE(review): how they are combined is not visible here
00071       void open( const std::string& base, const std::string& relative );
00072       void close();
00073 
00074       const std::string& path();
00075       lemur::api::DOCID_T documentBase();
00076       // Field name <-> field ID conversion (by the signatures: name in -> int out, int in -> name out)
00077       int field( const char* fieldName );
00078       int field( const std::string& fieldName );
00079       std::string field( int fieldID );
00080       // Term string <-> term ID conversion, same overload shape as field()
00081       lemur::api::TERMID_T term( const char* term );
00082       lemur::api::TERMID_T term( const std::string& term );
00083       std::string term( lemur::api::TERMID_T termID );
00084       // Document-level statistics; NOTE(review): the overload taking a term presumably counts only documents containing it -- confirm
00085       int documentLength( lemur::api::DOCID_T documentID );
00086       UINT64 documentCount();
00087       UINT64 documentCount( const std::string& term );
00088       lemur::api::DOCID_T documentMaximum();
00089       UINT64 uniqueTermCount();
00090       // Term occurrence counts: for one term, or with no argument (NOTE(review): presumably the whole index -- confirm)
00091       UINT64 termCount( const std::string& term );
00092       UINT64 termCount();
00093       // Per-field variants of termCount(), optionally narrowed to one term
00094       UINT64 fieldTermCount( const std::string& field );
00095       UINT64 fieldTermCount( const std::string& field, const std::string& term );
00096       // Per-field variants of documentCount(), optionally narrowed to one term
00097       UINT64 fieldDocumentCount( const std::string& field );
00098       UINT64 fieldDocumentCount( const std::string& field, const std::string& term );
00099 
00100       //
00101       // Lists: iterator and list accessors. NOTE(review): all return raw pointers; ownership is not visible here -- confirm whether callers must delete them
00102       //
00103       
00104       DocListIterator* docListIterator( lemur::api::TERMID_T termID );
00105       DocListIterator* docListIterator( const std::string& term );
00106       DocListFileIterator* docListFileIterator();
00107       DocExtentListIterator* fieldListIterator( int fieldID );
00108       DocExtentListIterator* fieldListIterator( const std::string& field );
00109       const TermList* termList( lemur::api::DOCID_T documentID );
00110       TermListFileIterator* termListFileIterator();
00111       // Vocabulary iteration: whole vocabulary, or only the frequent / infrequent portion (by the names)
00112       VocabularyIterator* vocabularyIterator();
00113       VocabularyIterator* frequentVocabularyIterator();
00114       VocabularyIterator* infrequentVocabularyIterator();
00115 
00116       DocumentDataIterator* documentDataIterator();
00117       // Lock accessors returning indri::thread::Lockable*; NOTE(review): which lock each returns is not visible here
00118       indri::thread::Lockable* iteratorLock();
00119       indri::thread::Lockable* statisticsLock();
00120       // cache limit for document lengths; NOTE(review): unit presumably bytes (20*1024*1024 = 20 MiB) -- confirm where it is used
00121       enum {
00123         // 250,000 documents/megabyte. NOTE(review): i.e. about 4 bytes per document length; listing line 00122 is absent from this view
00124         MAX_DOCLENGTHS_CACHE = 20*1024*1024
00125       };
00126     };
00127   }
00128 }
00129 
00130 #endif // INDRI_DISKINDEX_HPP

Generated on Tue Jun 15 11:02:53 2010 for Lemur by doxygen 1.3.4