Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

IndexWriter.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 //
00013 // IndexWriter
00014 //
00015 // 26 November 2004 -- tds
00016 //
00017 
00018 #ifndef INDRI_INDEXWRITER_HPP
00019 #define INDRI_INDEXWRITER_HPP
00020 
00021 #include <vector>
00022 #include <utility>
00023 #include <queue>
00024 
00025 #include "lemur-compat.hpp"
00026 #include "indri/indri-platform.h"
00027 #include "indri/greedy_vector"
00028 #include "indri/TermData.hpp"
00029 #include "Keyfile.hpp"
00030 #include "indri/Index.hpp"
00031 #include "indri/DocListFileIterator.hpp"
00032 #include "indri/File.hpp"
00033 #include "indri/SequentialWriteBuffer.hpp"
00034 #include "indri/CorpusStatistics.hpp"
00035 #include "indri/FieldStatistics.hpp"
00036 #include "indri/TermBitmap.hpp"
00037 #include "indri/TermRecorder.hpp"
00038 #include "indri/TermTranslator.hpp"
00039 #include "indri/DeletedDocumentList.hpp"
00040 #include "indri/BulkTree.hpp"
00041 
00042 namespace indri {
00043   namespace index {
00044 
00045     struct WriterIndexContext {
00046       struct greater {
00047       private:
00048         indri::index::DocListFileIterator::iterator_greater _iterator_greater;
00049   
00050         int _compareTerms( const WriterIndexContext* const&  one, const WriterIndexContext* const& two ) const {
00051           const char* oneTerm = one->iterator->currentEntry()->termData->term;
00052           const char* twoTerm = two->iterator->currentEntry()->termData->term;
00053 
00054           return strcmp( oneTerm, twoTerm );
00055         }
00056 
00057         int _compareDocuments( const WriterIndexContext* const&  one, const WriterIndexContext* const& two ) const {
00058           const indri::index::DocListIterator::DocumentData* oneData = one->iterator->currentEntry()->iterator->currentEntry();
00059           const indri::index::DocListIterator::DocumentData* twoData = two->iterator->currentEntry()->iterator->currentEntry();
00060 
00061           lemur::api::DOCID_T oneDocument = oneData ? oneData->document + one->documentOffset : 0;
00062           lemur::api::DOCID_T twoDocument = twoData ? twoData->document + two->documentOffset : 0;
00063 
00064           return oneDocument > twoDocument;
00065         }
00066 
00067       public:
00068         bool operator () ( const WriterIndexContext* const&  one, const WriterIndexContext* const& two ) const {
00069           assert( !one->iterator->finished() && !two->iterator->finished() );
00070 
00071           int result = _compareTerms( one, two );
00072 
00073           // if terms don't match, we're done
00074           if( result != 0 )
00075             return result > 0;
00076 
00077           // terms match, so go by document
00078           return _compareDocuments( one, two ) > 0;
00079         }
00080       };
00081 
00082       WriterIndexContext( indri::index::Index* _index, indri::index::DeletedDocumentList* _deletedList, lemur::api::DOCID_T _documentOffset ) {
00083         deletedList = _deletedList;
00084         documentOffset = _documentOffset;
00085 
00086         bitmap = new indri::index::TermBitmap;
00087         index = _index;
00088         wasInfrequentCount = 0;
00089         wasFrequentCount = 0;
00090 
00091         if( index->iteratorLock() )
00092           index->iteratorLock()->lock();
00093     
00094         iterator = index->docListFileIterator();
00095         iterator->startIteration();
00096 
00097         newlyFrequent = new indri::index::TermRecorder;
00098         oldFrequent = new indri::index::TermRecorder;
00099         oldInfrequent = new indri::utility::HashTable<lemur::api::TERMID_T, lemur::api::TERMID_T>;
00100 
00101         // DEBUG
00102         sequenceCount = 0;
00103       }
00104 
00105       ~WriterIndexContext() {
00106         delete iterator;
00107 
00108         if( index->iteratorLock() )
00109           index->iteratorLock()->unlock();
00110 
00111         delete oldFrequent;
00112         delete newlyFrequent;
00113         delete oldInfrequent;
00114         delete bitmap;
00115       }
00116 
00117       indri::index::DocListFileIterator* iterator;
00118       indri::index::TermBitmap* bitmap;
00119       indri::index::Index* index;
00120 
00121       int wasFrequentCount;
00122       int wasInfrequentCount;
00123       int sequenceCount;
00124       indri::index::TermRecorder* newlyFrequent;
00125       indri::index::TermRecorder* oldFrequent;
00126       indri::utility::HashTable<lemur::api::TERMID_T, lemur::api::TERMID_T>* oldInfrequent;
00127 
00128       indri::index::DeletedDocumentList* deletedList;
00129       lemur::api::DOCID_T documentOffset;
00130     };
00131 
00132     typedef std::priority_queue<WriterIndexContext*,
00133                                 std::vector<WriterIndexContext*>,
00134                                 WriterIndexContext::greater> invertedlist_pqueue;
00135 
00136     class IndexWriter {
00137     private:
00138       struct disktermdata_count_greater {
00139         bool operator () ( const DiskTermData* one, const DiskTermData* two ) const {
00140           return one->termData->corpus.totalCount > two->termData->corpus.totalCount;
00141         }
00142       };
00143 
00144       struct disktermdata_alpha_less {
00145         bool operator () ( const DiskTermData* one, const DiskTermData* two ) const {
00146           return strcmp( one->termData->term, two->termData->term ) < 0;
00147         }
00148       };
00149 
00150       struct keyfile_pair {
00151         indri::file::BulkTreeWriter* stringMap;
00152         indri::file::BulkTreeWriter* idMap;
00153       };
00154 
00155       keyfile_pair _infrequentTerms;
00156       keyfile_pair _frequentTerms;
00157       indri::file::File _frequentTermsData;
00158 
00159       indri::file::BulkTreeReader _infrequentTermsReader;
00160       indri::file::BulkTreeReader _frequentTermsReader;
00161 
00162       indri::file::File _documentStatistics;
00163       indri::file::File _documentLengths;
00164 
00165       indri::file::File _invertedFile;
00166       indri::file::File _directFile;
00167       indri::file::File _fieldsFile;
00168 
00169       indri::file::SequentialWriteBuffer* _invertedOutput;
00170 
00171       indri::utility::greedy_vector<indri::index::DiskTermData*> _topTerms;
00172       int _topTermsCount;
00173       indri::utility::Buffer _termDataBuffer;
00174 
00175       int _isFrequentCount;
00176       lemur::api::DOCID_T _documentBase;
00177       indri::index::CorpusStatistics _corpus;
00178       std::vector<indri::index::Index::FieldDescription> _fields;
00179       std::vector<indri::index::FieldStatistics> _fieldData;
00180 
00181       void _writeManifest( const std::string& path );
00182       void _writeSkip( indri::file::SequentialWriteBuffer* buffer, lemur::api::DOCID_T document, int length );
00183       void _writeBatch( indri::file::SequentialWriteBuffer* buffer, lemur::api::DOCID_T document, int length, indri::utility::Buffer& data );
00184 
00185       void _writeFieldLists( std::vector<WriterIndexContext*>& contexts, const std::string& path );
00186       void _writeFieldList( indri::file::SequentialWriteBuffer& output, int fieldIndex, std::vector<indri::index::DocExtentListIterator*>& iterators, std::vector<WriterIndexContext*>& contexts );
00187 
00188       void _pushInvertedLists( indri::utility::greedy_vector<WriterIndexContext*>& lists, invertedlist_pqueue& queue );
00189       void _fetchMatchingInvertedLists( indri::utility::greedy_vector<WriterIndexContext*>& lists, invertedlist_pqueue& queue );
00190       void _writeStatistics( indri::utility::greedy_vector<WriterIndexContext*>& lists, indri::index::TermData* termData, UINT64& startOffset );
00191       void _writeInvertedLists( std::vector<WriterIndexContext*>& contexts );
00192 
00193       void _storeIdEntry( IndexWriter::keyfile_pair& pair, indri::index::DiskTermData* diskTermData );
00194       void _storeStringEntry( IndexWriter::keyfile_pair& pair, indri::index::DiskTermData* diskTermData );
00195 
00196       void _storeTermEntry( IndexWriter::keyfile_pair& pair, indri::index::DiskTermData* diskTermData );
00197       void _storeFrequentTerms();
00198       void _addInvertedListData( indri::utility::greedy_vector<WriterIndexContext*>& lists, indri::index::TermData* termData, indri::utility::Buffer& listBuffer, UINT64& endOffset );
00199       void _storeMatchInformation( indri::utility::greedy_vector<WriterIndexContext*>& lists, int sequence, indri::index::TermData* termData, UINT64 startOffset, UINT64 endOffset );
00200 
00201       lemur::api::TERMID_T _lookupTermID( indri::file::BulkTreeReader& keyfile, const char* term );
00202 
00203       void _buildIndexContexts( std::vector<WriterIndexContext*>& contexts, std::vector<indri::index::Index*>& indexes, indri::index::DeletedDocumentList& deletedList );
00204       void _buildIndexContexts( std::vector<WriterIndexContext*>& contexts, std::vector<indri::index::Index*>& indexes, std::vector<indri::index::DeletedDocumentList*>& deletedLists, const std::vector<lemur::api::DOCID_T>& documentOffsets );
00205       
00206       void _writeDirectLists( std::vector<WriterIndexContext*>& contexts );
00207       void _writeDirectLists( WriterIndexContext* context,
00208                               indri::file::SequentialWriteBuffer* directOutput,
00209                               indri::file::SequentialWriteBuffer* lengthsOutput,
00210                               indri::file::SequentialWriteBuffer* dataOutput );
00211 
00212       void _constructFiles( const std::string& path );
00213       void _closeFiles( const std::string& path );
00214       void _openTermsReaders( const std::string& path );
00215 
00216       indri::index::TermTranslator* _buildTermTranslator( indri::file::BulkTreeReader& newInfrequentTerms,
00217                                                           indri::file::BulkTreeReader& newFrequentTerms,
00218                                                           indri::index::TermRecorder& oldFrequentTermsRecorder,
00219                                                           indri::utility::HashTable<lemur::api::TERMID_T, lemur::api::TERMID_T>* oldInfrequent,
00220                                                           indri::index::TermRecorder& newFrequentTermsRecorder,
00221                                                           indri::index::Index* index,
00222                                                           indri::index::TermBitmap* bitmap );
00223       
00224       // buffers for _lookupTermID
00225       char *_compressedData;
00226       char *_uncompressedData;
00227       int _dataSize;
00228 
00229       enum {
00230         TOPDOCS_DOCUMENT_COUNT = 1000,
00231         FREQUENT_TERM_COUNT = 1000
00232       };
00233 
00234     public:
00235       IndexWriter();
00236       void write( indri::index::Index& index,
00237                   std::vector<indri::index::Index::FieldDescription>& fields,
00238                   indri::index::DeletedDocumentList& deletedList,
00239                   const std::string& fileName );
00240       void write( std::vector<indri::index::Index*>& indexes,
00241                   std::vector<indri::index::Index::FieldDescription>& fields,
00242                   indri::index::DeletedDocumentList& deletedList,
00243                   const std::string& fileName );
00244       void write( std::vector<indri::index::Index*>& indexes,
00245                   std::vector<indri::index::Index::FieldDescription>& fields,
00246                   std::vector<indri::index::DeletedDocumentList*>& deletedLists, 
00247                   const std::vector<lemur::api::DOCID_T>& documentMaximums,
00248                   const std::string& path );
00249     };
00250   }
00251 }
00252 
00253 #endif // INDRI_INDEXWRITER_HPP

Generated on Tue Jun 15 11:02:54 2010 for Lemur by doxygen 1.3.4