Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

KeyfileIncIndex.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2001 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 #ifndef _LEMUR_KEYFILE_INCINDEX_HPP
00014 #define _LEMUR_KEYFILE_INCINDEX_HPP
00015 
00016 /*
00017  * NAME DATE - COMMENTS
00018  * tnt 01/02 - created
00019  * dmf 07/03 - converted to incremental berkeley db btree indexer with
00020  * integrated document manager.
00021  * tds 09/03 - modified from BTIncIndex to use keyfile
00022  * dmf 12/03 - update to 2.1 API and remove parser/docmgr components.
00023  */
00024 #include "common_headers.hpp"
00025 #include "Index.hpp"
00026 #include "InvFPDocList.hpp"
00027 #include "InvFPTermList.hpp"
00028 #include "InvFPTypes.hpp"
00029 #include "BasicCollectionProps.hpp"
00030 #include "Param.hpp"
00031 #include "PushIndex.hpp"
00032 #include "MemCache.hpp"
00033 #include "Keyfile.hpp"
00034 #include "KeyfileDocMgr.hpp"
00035 #include "ReadBuffer.hpp"
00036 #include "WriteBuffer.hpp"
00037 #include "TermCache.hpp"
00038 #include <cstring>
00039 #include <queue>
00040 namespace lemur
00041 {
00042   namespace index 
00043   {
00044     
00045     // for counts array
          // Slot indices into KeyfileIncIndex::counts. In this header the inline
          // accessors read counts[DOCS] (docCount), counts[UNIQUE_TERMS]
          // (termCountUnique) and counts[TOTAL_TERMS] (termCount). DT_FILES and
          // INV_FILES are not referenced anywhere in this header -- presumably
          // they are used by the implementation file; verify before changing.
          // NOTE: these are preprocessor macros, so although they are written
          // inside namespace lemur::index they are not scoped by it.
00046 #define UNIQUE_TERMS 0
00047 #define TOTAL_TERMS  1
00048 #define DOCS         2
00049 #define DT_FILES     3
00050 #define INV_FILES    4
00051     // keyref.h -- 512
          // Buffer sizes for the docKey / termKey scratch arrays declared in
          // KeyfileIncIndex. NOTE(review): the comment above suggests the value
          // mirrors a 512 limit defined in keyref.h -- confirm against that file.
00052 #define MAX_DOCID_LENGTH 512
00053 #define MAX_TERM_LENGTH 512
00054 
          // Number of SegmentOffset entries stored in each
          // KeyfileIncIndex::TermData (size of its fixed segments[] array).
00055 #define KEYFILE_MAX_SEGMENTS (16)
00056 
00057     // we love multiple inheritance
00058 
          /// Index class built on lemur::file::Keyfile storage (the change log at
          /// the top of this header records it as "modified from BTIncIndex to
          /// use keyfile"). It derives from both PushIndex and lemur::api::Index,
          /// so a single object declares the document-at-a-time build calls
          /// (beginDoc / addTerm / endDoc / endCollection) as well as the read
          /// accessors (term, document, docCount, docInfoList, ...).
          ///
          /// NOTE(review): which base class declares which member function is
          /// not visible from this header. Several declarations use unqualified
          /// string / vector / ostream while others spell std:: explicitly; the
          /// unqualified names rely on a using-directive or using-declaration
          /// supplied by one of the included headers.
00071     class KeyfileIncIndex : public PushIndex, public lemur::api::Index {
00072     public:
            /// Per-document record; fetchDocumentRecord() returns one by value.
            /// NOTE(review): the field meanings below are inferred from their
            /// names only -- confirm against the implementation.
00074       class record {
00075       public:
              /// File offset (presumably into the dtlookup file).
00077         lemur::file::File::offset_type offset;
              /// Presumably the document length as counted by docLength().
00079         int len;
              /// Presumably the document length as reported by totaldocLength().
00081         int totalLen;
              /// Integer tag; its meaning is not evident from this header.
00083         int num;     
00084       };
            /// Location of one chunk of data: a segment number plus the length
            /// and file offset of the data. TermData stores an array of these.
00086       struct SegmentOffset {
              /// Segment number (presumably an index into _segments -- confirm).
00088         unsigned int segment;
              /// Length of the data (units are not stated in this header).
00090         unsigned int length;
              /// File offset of the data.
00092         lemur::file::File::offset_type offset;
00093       };
            /// Per-term data: two counters plus a fixed-size table of segment
            /// locations.
00095       struct TermData {
              /// Presumably the total number of occurrences of the term -- confirm.
00097         lemur::api::COUNT_T totalCount;
              /// Presumably the number of documents containing the term -- confirm.
00099         lemur::api::COUNT_T documentCount;
              /// Fixed array of KEYFILE_MAX_SEGMENTS segment locations.
00101         SegmentOffset segments[ KEYFILE_MAX_SEGMENTS ];
00102       };
            /// Construct with an index name prefix.
            /// @param prefix     name prefix for the index
            /// @param cachesize  defaults to 128000000 (presumably a memory
            ///                   budget in bytes -- confirm)
            /// @param startdocid defaults to 1
00105       KeyfileIncIndex(const string &prefix, int cachesize=128000000, 
00106                       lemur::api::DOCID_T startdocid=1);
            /// Default constructor; takes no name prefix.
00108       KeyfileIncIndex();
            /// Destructor. Not declared virtual here; whether it is virtual
            /// depends on the base classes, which are not visible in this header.
00110       ~KeyfileIncIndex();
00111 
            /// Set the index name prefix.
00113       void setName(const string &prefix);
00114 
            /// Begin a document described by dp. The meaning of the bool result
            /// is defined by the implementation (not visible here).
00116       bool beginDoc(const lemur::parse::DocumentProps* dp);
00117 
            /// Add term t (presumably to the document currently being built).
00119       bool addTerm(const lemur::api::Term& t);
00120 
            /// End the document described by dp.
00122       void endDoc(const lemur::parse::DocumentProps* dp);
00123 
            /// End the document described by dp, additionally naming a document
            /// manager via mgr. Declared virtual so subclasses may override it.
00125       virtual void endDoc(const lemur::parse::DocumentProps* dp, const string &mgr);
00126 
            /// End the collection; cp carries the collection-level properties.
00128       void endCollection(const lemur::parse::CollectionProps* cp);
00129 
            /// Select the document manager identified by mgrID.
00131       void setDocManager(const string &mgrID);
00132     
00133     protected:
            /// Attempt to open; returns a bool status (presumably true when an
            /// existing index was opened -- confirm).
00135       bool tryOpen();
            /// Write the table of contents using the given collection properties.
00137       void writeTOC(const lemur::parse::CollectionProps* cp);
            /// Write out the cache. lastRun defaults to false (presumably true
            /// only for the final flush -- confirm).
00139       void writeCache( bool lastRun = false );
            /// Final cache write (relationship to writeCache(true) is not
            /// visible in this header).
00141       void lastWriteCache();
00142 
            /// Merge cache segments.
00144       void mergeCacheSegments();
            /// Write a single cache segment.
00146       void writeCacheSegment();
            /// Write the document manager ids.
00148       void writeDocMgrIDs();
            /// Return an int id for the document manager named mgr.
00151       int docMgrID(const string &mgr);
            /// Virtual hook taking the document properties and a numeric manager
            /// id. NOTE(review): presumably the shared implementation behind
            /// both endDoc overloads -- confirm.
00153       virtual void doendDoc(const lemur::parse::DocumentProps* dp, int mgrid);
            /// Integer state; its purpose is not evident from this header.
00155       int listlengths;
00156   
00157     public:
00159 
00160 
            /// Open the index named indexName; returns a bool status.
00162       bool open(const string &indexName);
00164 
00166 
00167 
            /// Term id for the term string word.
00169       lemur::api::TERMID_T term(const lemur::api::TERM_T &word) const;
00170 
            /// Term string for termID (returned by const value).
00172       const lemur::api::TERM_T term(lemur::api::TERMID_T termID) const;
00173 
            /// Internal document id for the external id string docIDStr.
00175       lemur::api::DOCID_T document(const lemur::api::EXDOCID_T &docIDStr) const;
00176 
            /// External id string for docID (returned by const value).
00178       const lemur::api::EXDOCID_T document(lemur::api::DOCID_T docID) const; 
00179 
            /// Document manager associated with docID. Returns a raw pointer;
            /// ownership is not stated in this header.
00181       const lemur::api::DocumentManager *docManager(lemur::api::DOCID_T docID) const;
00182 
            /// Collection properties. Returns a raw pointer; ownership is not
            /// stated in this header.
00183       const lemur::parse::CollectionProps *collectionProps() const;
00185 
00187 
00188 
            /// Returns counts[DOCS].
00190       lemur::api::COUNT_T docCount() const { return counts[DOCS]; };
00191 
            /// Returns counts[UNIQUE_TERMS].
00193       lemur::api::COUNT_T termCountUnique() const { return counts[UNIQUE_TERMS]; };
00194 
            /// Count for a single term (presumably its total number of
            /// occurrences -- confirm).
00196       lemur::api::COUNT_T termCount(lemur::api::TERMID_T termID) const;
00197 
            /// Returns counts[TOTAL_TERMS].
00199       lemur::api::COUNT_T termCount() const { return counts[TOTAL_TERMS]; };
00200 
            /// Average document length.
00202       float docLengthAvg() const;
00203 
            /// Document count for a single term (presumably the number of
            /// documents containing it -- confirm).
00205       lemur::api::COUNT_T docCount(lemur::api::TERMID_T termID) const;
00206 
            /// Length of document docID.
00208       lemur::api::COUNT_T docLength(lemur::api::DOCID_T docID) const;
00209 
            /// Total length of document docID. Declared virtual. NOTE(review):
            /// how this differs from docLength() is not visible here.
00211       virtual lemur::api::COUNT_T totaldocLength (lemur::api::DOCID_T docID) const;
00212 
            /// "Counted" length of document docID. NOTE(review): the counting
            /// rule is defined by the implementation.
00214       lemur::api::COUNT_T docLengthCounted(lemur::api::DOCID_T docID) const;
00215 
00217 
00219 
00220 
            /// DocInfoList for termID. Returns a raw pointer; who releases it is
            /// not stated in this header.
00221       lemur::api::DocInfoList* docInfoList(lemur::api::TERMID_T termID) const;
00222 
            /// TermInfoList for docID. Returns a raw pointer; who releases it is
            /// not stated in this header.
00224       lemur::api::TermInfoList* termInfoList(lemur::api::DOCID_T docID) const;
            /// Variant of termInfoList() (presumably preserving term sequence
            /// order -- confirm). Same raw-pointer caveat applies.
00226       lemur::api::TermInfoList* termInfoListSeq(lemur::api::DOCID_T docID) const;
00227 
00229 
            /// Set the output stream pointer (see the msgstream member).
00231       void setMesgStream(ostream * lemStream);
            /// Record an occurrence of termID at position.
00233       void addKnownTerm( lemur::api::TERMID_T termID, lemur::api::LOC_T position );
            /// Add a term and return its term id. NOTE(review): "unknown"
            /// presumably means the term has no id yet -- confirm.
00235       lemur::api::TERMID_T addUnknownTerm( const InvFPTerm* term );
            /// Add a term and return its term id. NOTE(review): "uncached"
            /// presumably means the term was not found in _cache -- confirm.
00237       lemur::api::TERMID_T addUncachedTerm( const InvFPTerm* term );
00238 
00239     protected:
            /// Open the database files.
00241       void openDBs();
            /// Open the segment files.
00243       void openSegments();
            /// Create the database files.
00245       void createDBs();
00246 
            /// Table-of-contents helper; exact behaviour is not visible here.
00248       void fullToc();
            /// Document-manager-id helper returning a bool status (presumably
            /// the read-side counterpart of writeDocMgrIDs() -- confirm).
00250       bool docMgrIDs();
            /// Fetch the record for document id key; returned by value.
00252       record fetchDocumentRecord( lemur::api::DOCID_T key ) const;
            /// Register a document id / document name pair.
00254       void addDocumentLookup( lemur::api::DOCID_T documentKey, const char* documentName );
            /// Register a term id / term spelling pair.
00256       void addTermLookup( lemur::api::TERMID_T termKey, const char* termSpelling );
            /// Register a number / name pair in two Keyfiles, one per lookup
            /// direction (going by the parameter names).
00258       void addGeneralLookup( lemur::file::Keyfile& numberNameIndex, 
00259                              lemur::file::Keyfile& nameNumberIndex, 
00260                              lemur::api::TERMID_T number, const char* name );
            /// Like docInfoList() but typed as the concrete InvFPDocList.
            /// Returns a raw pointer; ownership is not stated in this header.
00262       InvFPDocList* internalDocInfoList(lemur::api::TERMID_T termID) const;
            /// Update the term list for the given doc list and position.
00264       void _updateTermlist( InvFPDocList* curlist, lemur::api::LOC_T position );
            /// Current cache size as an int (units are not stated here).
00266       int _cacheSize();
            /// Derive memory bounds from memorySize.
00268       void _computeMemoryBounds( int memorySize );
            /// Reset _estimatePoint.
00270       void _resetEstimatePoint();
            /// Counter array indexed by the UNIQUE_TERMS / TOTAL_TERMS / DOCS /
            /// DT_FILES / INV_FILES macros. Raw pointer; its allocation and
            /// release are not visible in this header.
00272       lemur::api::COUNT_T* counts;    
            /// List of name strings (contents are not evident from this header).
00274       std::vector<std::string> names;
            /// Average document length value.
00276       float aveDocLen; 
            /// Document manager names/ids as strings.
00278       vector<std::string> docmgrs;
            /// Output stream pointer (see setMesgStream()).
00280       ostream* msgstream;
00281 
00282       // All database handles are marked mutable since they sometimes
00283       // must be used to fetch values during const methods
            /// Keyfile handle (presumably the inverted-list lookup table).
00285       mutable lemur::file::Keyfile invlookup;
00286   
00287       // int <-> string mappings for documents and terms
            /// Document id mapping Keyfile (direction not visible here).
00289       mutable lemur::file::Keyfile dIDs;
            /// Document string mapping Keyfile (direction not visible here).
00291       mutable lemur::file::Keyfile dSTRs;
            /// Term id mapping Keyfile (direction not visible here).
00293       mutable lemur::file::Keyfile tIDs;
            /// Term string mapping Keyfile (direction not visible here).
00295       mutable lemur::file::Keyfile tSTRs;
            /// File handle (presumably the document term-list lookup file).
00297       mutable lemur::file::File dtlookup; 
            /// Read buffer pointer (presumably wrapping dtlookup -- confirm).
00299       lemur::file::ReadBuffer* dtlookupReadBuffer; 
            /// File handle (presumably where term lists are written).
00302       mutable lemur::file::File writetlist; 
00303 
            /// Scratch buffer of MAX_TERM_LENGTH chars; mutable so const
            /// methods can write to it.
00305       mutable char termKey[MAX_TERM_LENGTH];
            /// Scratch buffer of MAX_DOCID_LENGTH chars; mutable so const
            /// methods can write to it.
00307       mutable char docKey[MAX_DOCID_LENGTH];
            /// Size figure for the lists (units are not stated here).
00309       int _listsSize;
            /// Memory size figure (see _computeMemoryBounds()).
00311       int _memorySize;
            /// Index name (see setName()).
00313       std::string name;
            /// In-memory InvFPDocList pointers (raw pointers; ownership is not
            /// stated in this header).
00315       vector<InvFPDocList*> invertlists; 
            /// LocatedTerm entries (presumably for the current document).
00317       vector<LocatedTerm> termlist; 
            /// Current document manager id (see setDocManager()).
00319       int curdocmgr; 
            /// Document manager pointers (raw pointers; ownership is not
            /// stated in this header).
00321       vector<lemur::api::DocumentManager*> docMgrs; 
            /// Term cache.
00323       lemur::utility::TermCache _cache;
            /// Collection properties pointer; mutable so const methods can
            /// assign it.
00325       mutable lemur::parse::BasicCollectionProps* cprops;
00326 
            /// Segment file pointers (raw pointers; ownership is not stated in
            /// this header).
00328       std::vector<lemur::file::File*> _segments;
            /// Largest term id that has been flushed, going by the name.
00330       lemur::api::TERMID_T _largestFlushedTermID;
            /// Estimate point (see _resetEstimatePoint()).
00332       int _estimatePoint; 
            /// Flag (presumably set when the current document is to be
            /// skipped -- confirm).
00334       bool ignoreDoc;  
            /// Flag (presumably set when the index is opened read-only).
00336       bool _readOnly;
00337     };
00338   }
00339 }
00340 
00341 
00342 #endif //_LEMUR_KEYFILE_INCINDEX_HPP

Generated on Tue Jun 15 11:02:54 2010 for Lemur by doxygen 1.3.4