Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

CompressedCollection.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // CompressedCollection.hpp
00015 //
00016 // 12 May 2004 -- tds
00017 //
00018 
00019 #ifndef INDRI_COMPRESSEDCOLLECTION_HPP
00020 #define INDRI_COMPRESSEDCOLLECTION_HPP
00021 
00022 #include "indri/Collection.hpp"
00023 #include "string-set.h"
00024 #include <string>
00025 #include <vector>
00026 #include "Keyfile.hpp"
00027 #include "indri/Buffer.hpp"
00028 #include "indri/SequentialWriteBuffer.hpp"
00029 #include "indri/SequentialReadBuffer.hpp"
00030 #include "indri/HashTable.hpp"
00031 #include "indri/File.hpp"
00032 #include "indri/Mutex.hpp"
00033 #include "IndexTypes.hpp"
00034 #include "indri/DeletedDocumentList.hpp"
00035 
00036 typedef struct z_stream_s* z_stream_p; // opaque pointer to z_stream_s, declared here without including its defining header (presumably zlib's stream state -- confirm)
00037 
00038 namespace indri
00039 {
00040   namespace collection
00041   {
00042     
00043     class CompressedCollection : public Collection { // Collection subclass; only declarations are visible here, the implementation lives elsewhere
00044     private:
00045       indri::thread::Mutex _lock; // NOTE(review): presumably guards the members below across threads -- confirm in the implementation
00046       // --- storage state ---
00047       std::string _basePath; // NOTE(review): likely derived from the fileName given to create()/open() -- confirm
00048       lemur::file::Keyfile _lookup; // keyfile; presumably maps a document key to its location in _storage -- confirm
00049       indri::file::File _storage; // file object; presumably holds the stored document records -- confirm
00050       indri::file::SequentialWriteBuffer* _output; // raw pointer; ownership and lifetime are not visible in this header
00051       indri::utility::Buffer _positionsBuffer; // buffer; presumably scratch space for term position data -- confirm
00052       z_stream_p _stream; // pointer to the forward-declared z_stream_s (see the file-scope typedef)
00053       // --- keyfile tables keyed by C-string name (presumably one per indexed metadata field -- confirm) ---
00054       indri::utility::HashTable<const char*, lemur::file::Keyfile*> _reverseLookups;
00055       indri::utility::HashTable<const char*, lemur::file::Keyfile*> _forwardLookups;
00056       String_set* _strings; // raw pointer to a String_set; NOTE(review): possibly owns the key strings used by the tables above -- confirm
00057       // --- write helpers: each takes the document plus keyLength/valueLength by non-const reference ---
00058       void _writePositions( indri::api::ParsedDocument* document, int& keyLength, int& valueLength );
00059       void _writeMetadataItem( indri::api::ParsedDocument* document, int i, int& keyLength, int& valueLength ); // i: presumably an index into the document's metadata -- confirm
00060       void _writeText( indri::api::ParsedDocument* document, int& keyLength, int& valueLength );
00061       void _writeContent( indri::api::ParsedDocument* document, int& keyLength, int& valueLength );
00062       void _writeContentLength( indri::api::ParsedDocument* document, int& keyLength, int& valueLength );
00063       // --- read helper: takes a raw byte range (pointer + length) and the document to operate on ---
00064       void _readPositions( indri::api::ParsedDocument* document, const void* positionData, int positionDataLength );
00065       // --- helpers taking a DeletedDocumentList and a keyfile (presumably used by compact() -- confirm) ---
00066       void _removeForwardLookups( indri::index::DeletedDocumentList& deletedList, lemur::file::Keyfile& keyfile );
00067       void _removeReverseLookups( indri::index::DeletedDocumentList& deletedList, lemur::file::Keyfile& keyfile );
00068       // --- copy helpers (presumably used by append() -- confirm); note _copyForwardLookup is overloaded, see also the 3-argument form below ---
00069       void _copyForwardLookup( const std::string& name,
00070                                lemur::file::Keyfile& other,
00071                                indri::index::DeletedDocumentList& deletedList,
00072                                lemur::api::DOCID_T documentOffset );
00073 
00074       void _copyReverseLookup( const std::string& name,
00075                                lemur::file::Keyfile& other,
00076                                indri::index::DeletedDocumentList& deletedList,
00077                                lemur::api::DOCID_T documentOffset );
00078 
00079       // --- storage copy helpers operating on sequential read/write buffers ---
00080       void _copyStorageEntry( indri::file::SequentialReadBuffer* input,
00081                               indri::file::SequentialWriteBuffer* output, 
00082                               int key,
00083                               UINT64 position,
00084                               UINT64 length, 
00085                               lemur::file::Keyfile& lookup );
00086       void _copyStorageData( indri::file::SequentialReadBuffer* input,
00087                              indri::file::SequentialWriteBuffer* output,
00088                              indri::index::DeletedDocumentList& deletedList,
00089                              lemur::api::DOCID_T documentOffset,
00090                              lemur::file::Keyfile& sourceLookup,
00091                              lemur::file::Keyfile& destLookup,
00092                              UINT64 storageLength );
00093       void _copyForwardLookup( const std::string& name, lemur::file::Keyfile& other, lemur::api::DOCID_T documentOffset ); // overload without a DeletedDocumentList
00094 
00095       bool _storeDocs;      // NOTE(review): presumably set from the storeDocs argument of create() -- confirm
00096     public:
00097       CompressedCollection();
00098       ~CompressedCollection();
00099       // --- lifecycle: every call takes the collection's fileName; differences between open/openRead/reopen are not visible here ---
00100       void create( const std::string& fileName );
00101       void create( const std::string& fileName, const std::vector<std::string>& indexedFields );
00102       void create( const std::string& fileName, const std::vector<std::string>& forwardIndexedFields, const std::vector<std::string>& reverseIndexedFields,  bool storeDocs = true ); // storeDocs defaults to true
00103       void reopen( const std::string& fileName );
00104       void open( const std::string& fileName );
00105       void openRead( const std::string& fileName ); // NOTE(review): presumably a read-only open -- confirm
00106       void close();
00107       bool exists(lemur::api::DOCID_T documentID);
00108       indri::api::ParsedDocument* retrieve( lemur::api::DOCID_T documentID ); // returns a raw pointer; ownership of the result is not stated in this header
00109       std::string retrieveMetadatum( lemur::api::DOCID_T documentID, const std::string& attributeName );
00110       std::vector<indri::api::ParsedDocument*> retrieveByMetadatum( const std::string& attributeName, const std::string& value ); // raw pointers; ownership not stated here
00111       std::vector<lemur::api::DOCID_T> retrieveIDByMetadatum( const std::string& attributeName, const std::string& value );
00112       // --- modification ---
00113       void addDocument( lemur::api::DOCID_T documentID, indri::api::ParsedDocument* document );
00114       void compact( indri::index::DeletedDocumentList& deletedList );
00115       void append( indri::collection::CompressedCollection& other, indri::index::DeletedDocumentList& deletedList, lemur::api::DOCID_T documentOffset );
00116       // --- introspection: field-name lists returned by value (presumably the names keyed in _forwardLookups/_reverseLookups -- confirm) ---
00117       std::vector<std::string> forwardFields();
00118       std::vector<std::string> reverseFields();
00119     };
00120   }
00121 }
00122 
00123 #endif // INDRI_COMPRESSEDCOLLECTION_HPP

Generated on Tue Jun 15 11:02:53 2010 for Lemur by doxygen 1.3.4