Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

TermBitmap.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2005 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 //
00013 // TermBitmap
00014 //
00015 // 12 January 2005 -- tds
00016 //
00017 
00018 
00019 #ifndef INDRI_TERMBITMAP_HPP
00020 #define INDRI_TERMBITMAP_HPP
00021 
00022 #include "indri/Buffer.hpp"
00023 #include "indri/delete_range.hpp"
00024 
00025 namespace indri {
00026   namespace index {
00060     class TermBitmap {
00061     private:
00062       std::vector<indri::utility::Buffer*> _maps;
00063       int _fromBase;
00064       int _toBase;
00065       int _lastFrom;
00066       char* _current;
00067 
00068       void _addBufferIfNecessary() {
00069         if( !_maps.size() || _maps.back()->position() == _maps.back()->size() )
00070           _maps.push_back( new indri::utility::Buffer( 64*1024 ) );
00071       }
00072 
00073       indri::utility::Buffer* _findBuffer( int from ) {
00074         assert( from >= 0 );
00075         
00076         int left = 0;
00077         int right = (int)_maps.size()-1;
00078 
00079         if( _maps.size() == 0 )
00080           return 0;
00081 
00082         while( right - left > 1 ) {
00083           int middle = left + (right - left) / 2;
00084           indri::utility::Buffer* mid = _maps[ middle ];
00085 
00086           if( from < *(INT32*) mid->front() ) {
00087             right = middle;
00088           } else {
00089             left = middle;
00090           }
00091         }
00092 
00093         if( right == left )
00094           return _maps[left];
00095 
00096         int rightFront = *(INT32*) _maps[right]->front();
00097 
00098         if( from < rightFront ) {
00099           return _maps[left];
00100         }
00101 
00102         return _maps[right];
00103       }
00104 
00105       const char* _findInBuffer( indri::utility::Buffer* b, int from ) {
00106         const char* start = b->front();
00107         const char* end = b->front() + b->position();
00108 
00109         while( end - start > 32 ) {
00110           const char* mid = start + (((end - start) / 2) & ~31);
00111           INT32 middle = *(INT32*) mid;
00112 
00113           if( from >= middle )
00114             start = mid;
00115           else
00116             end = mid;
00117         }
00118 
00119         INT32 front = *(INT32*)start;
00120         assert( from >= front && from < (front + 192) ); 
00121 
00122         return start;
00123       }
00124 
00125       int _bitsSet( unsigned char c ) {
00126         static int bset[] = { 0, 1, 1, 2,   // 0, 1, 2, 3
00127                               1, 2, 2, 3,   // 4, 5, 6, 7
00128                               1, 2, 2, 3,   // 8, 9, A, B
00129                               2, 3, 3, 4 }; // C, D, E, F
00130 
00131         return bset[ c & 0xf ] + bset[ (c>>4) ];
00132       }
00133 
00134     public:
00135       TermBitmap() {
00136         _lastFrom = -1;
00137         _toBase = -10000;
00138         _fromBase = -10000;
00139       }
00140 
00141       ~TermBitmap() {
00142         delete_vector_contents( _maps );
00143       }
00144 
00145       int lastFrom() {
00146         return _lastFrom;
00147       }
00148 
00149       void add( int to ) {
00150         add( _lastFrom+1, to );
00151       }
00152 
00153       void add( int from, int to ) {
00154         assert( _lastFrom < from );
00155 
00156         const int availableSpace = ((32 - 8) * 8);
00157         int difference = to - _toBase;
00158 
00159         assert( difference >= 0 );
00160 
00161         if( difference >= availableSpace || _lastFrom < from-1 ) {
00162           _addBufferIfNecessary();
00163 
00164           // get the current buffer
00165           indri::utility::Buffer* back = _maps.back();
00166           _current = back->write( 32 );
00167 
00168           *(INT32*)_current = from;
00169           *(INT32*)(_current + 4) = to;
00170           memset( _current + 8, 0, (32 - 8) ); 
00171 
00172           _toBase = to;
00173           _fromBase = from;
00174           difference = 0;
00175         }
00176 
00177         _current[8+difference/8] |= 1<<(difference%8);
00178         _lastFrom = from;
00179 
00180         assert( get(from) == to );
00181       }
00182 
00183       int get( int from ) {
00184         assert( from <= _lastFrom );
00185 
00186         // first, binary search through the buffers themselves
00187         indri::utility::Buffer* buffer = _findBuffer( from );
00188         const char* spot = _findInBuffer( buffer, from );
00189 
00190         // now, we have to scan through this buffer to find the right value
00191         int fromBase = *(INT32*)spot;
00192         int toBase = *(INT32*)(spot+4);
00193         spot += 8;
00194         int bits = 0;
00195         int found = 0;
00196 
00197         // first, go byte by byte
00198         unsigned char c;
00199         int need = from - fromBase + 1; // number of bits we need to find (plus 1 for the zero bit)
00200 
00201         c = spot[bits/8];
00202         int byteBits = _bitsSet( c );
00203 
00204         while( found + byteBits < need ) {
00205           bits += 8;
00206           found += byteBits;
00207           c = spot[bits/8];
00208           byteBits = _bitsSet( c );
00209         }
00210 
00211         // now, examine each bit
00212         int i;
00213         for( i=0; i<8; i++ ) {
00214           if( c & 1<<i ) {
00215             found++;
00216 
00217             if( found == need )
00218               break;
00219           }
00220         }
00221 
00222         bits += i;
00223         assert( (fromBase + found - 1) == from );
00224         return toBase + bits;
00225       }
00226 
00227       size_t memorySize() {
00228         return _maps.size() * (64*1024);
00229       }
00230     };
00231   }
00232 }
00233 
00234 #endif // INDRI_TERMBITMAP_HPP

Generated on Tue Jun 15 11:02:56 2010 for Lemur by doxygen 1.3.4