Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

RVLCompress.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2002 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 #ifndef _RVLCOMPRESS_HPP
00013 #define _RVLCOMPRESS_HPP
00014 
00015 #include "lemur-platform.h"
00016 namespace lemur 
00017 {
00018   namespace utility 
00019   {
00020     
// Smallest values that no longer fit into one, two, three and four groups of
// seven bits, followed by 2^31.
#define pow2_7  128
#define pow2_14 16384
#define pow2_21 2097152
#define pow2_28 268435456

#define pow2_31 2147483648U

// The low seven bits of an encoded byte carry payload; the high bit flags the
// last byte of a value.
#define RVL_COMPRESS_MASK                      ((1<<7)-1)
#define RVL_COMPRESS_TERMINATE_BIT             (1<<7)
// Store seven-bit group number b of `in` into d[b]: RVL_COMPRESS_BYTE leaves
// the terminate bit clear, RVL_COMPRESS_TERMINATE sets it.  Every parameter is
// parenthesized so that expression arguments (for example an index of i+1)
// expand with the intended precedence.  Note that b is evaluated twice.
#define RVL_COMPRESS_BYTE( d, in, b )          ((d)[(b)] = (char) (((in) >> (7*(b))) & RVL_COMPRESS_MASK))
#define RVL_COMPRESS_TERMINATE( d, in, b )     ((d)[(b)] = (char) (((in) >> (7*(b))) | RVL_COMPRESS_TERMINATE_BIT))
00032 
00033     class RVLCompress {
00034     public:
00036       static int compress_ints (int *data_ptr, unsigned char *out_ptr, int size);
00037 
00039       static int decompress_ints(unsigned char *data_ptr, int *out_ptr, int num_bytes);
00040 
00042       static int compressedSize( int data );
00044       static int compressedSize( INT64 data );
00046       static int compressedSize( UINT64 data );
00047 
00048       static UINT64 foldNegatives( INT64 number );
00049       static INT64 unfoldNegatives( UINT64 number );
00050 
00051       static char* compress_int( char* dest, int data );
00052       static char* compress_longlong( char* dest, UINT64 data );
00053       static char* compress_longlong( char* dest, INT64 data );
00054       static const char* decompress_int( const char* source, int& data );
00055       static const char* decompress_longlong( const char* source, UINT64& data );
00056       static const char* decompress_longlong( const char* source, INT64& data );
00057       static const char* decompress_int_count( const char* source, int* result, int numInts );
00058       static const char* skip_ints( const char* source, int numInts );
00059 
00060     private:
00061       static char* _compress_bigger_int( char* dest, int data );
00062       static char* _compress_bigger_longlong( char* dest, UINT64 data );
00063     };
00064 
00065     inline UINT64 RVLCompress::foldNegatives( INT64 number ) {
00066       // fold negative numbers into positive ones, use low bit as negative sign
00067       UINT64 folded;
00068 
00069       if( number < 0 )
00070         folded = (2 * -number) - 1;
00071       else
00072         folded = 2 * number;
00073 
00074       return folded;
00075     }
00076 
00077     inline INT64 RVLCompress::unfoldNegatives( UINT64 number ) {
00078       INT64 unfolded;
00079 
00080       if( number & 1 ) {
00081         // number is negative
00082         unfolded = -INT64((number + 1) / 2);
00083       } else {
00084         // number is positive
00085         unfolded = number / 2;
00086       }
00087 
00088       return unfolded;
00089     }
00090 
00091     inline int RVLCompress::compressedSize( int data ) {
00092       return compressedSize( UINT64( data ) );
00093     }
00094 
00095     inline int RVLCompress::compressedSize( INT64 data ) {
00096       return compressedSize( foldNegatives( data ) );
00097     }
00098 
00099     inline const char* RVLCompress::decompress_int( const char* source, int& data ) {
00100       const unsigned int terminator = (1<<7);
00101       const unsigned int mask = ((1<<7)-1);
00102 
00103       if( source[0] & terminator ) {
00104         data = source[0] & mask;
00105         return source + 1;
00106       } else if ( source[1] & terminator ) {
00107         data = (source[0])       | 
00108           ((source[1]&mask) << 7);
00109         return source + 2;
00110       } else if ( source[2] & terminator ) {
00111         data = (source[0])       | 
00112           (source[1] << 7)  |
00113           ((source[2]&mask) << 14);
00114         return source + 3;
00115       } else if ( source[3] & terminator ) {
00116         data = (source[0])       | 
00117           (source[1] << 7)  |
00118           (source[2] << 14) |
00119           ((source[3]&mask) << 21);
00120         return source + 4;
00121       } else {
00122         data = (source[0])       | 
00123           (source[1] << 7)  |
00124           (source[2] << 14) |
00125           (source[3] << 21) |
00126           ((source[4]&mask) << 28);  
00127         return source + 5;
00128       }
00129     }
00130 
00131     inline const char* RVLCompress::decompress_longlong( const char* source, UINT64& data ) {
00132       const unsigned int terminator = (1<<7);
00133       const unsigned int mask = ((1<<7)-1);
00134       unsigned int i;
00135 
00136       data = 0;
00137 
00138       for( i=0; i<10; i++ ) {
00139         if( source[i] & terminator ) {
00140           data |= (UINT64(source[i] & mask) << 7*i);
00141           break;
00142         } else {
00143           data |= (UINT64(source[i]) << 7*i);
00144         }
00145       }
00146 
00147       return source + i + 1;
00148     }
00149 
00150     inline const char* RVLCompress::decompress_int_count( const char* source, int* result, int numInts ) {
00151       const char* ptr = source;
00152 
00153       for( int i=0; i<numInts; i++ ) {
00154         ptr = decompress_int( ptr, result[i] );
00155       }
00156 
00157       return ptr;
00158     }
00159 
00160     inline const char* RVLCompress::skip_ints( const char* source, int numInts ) {
00161       while( numInts-- ) {
00162         while( !(*source & 0x80) )
00163           source++;
00164         source++;
00165       }
00166       return source;
00167     }
00168 
00169     inline int RVLCompress::compressedSize( UINT64 data ) {
00170       if( data < pow2_7 ) {
00171         return 1;
00172       } else if ( data < pow2_14 ) {
00173         return 2;
00174       } else if ( data < pow2_21 ) {
00175         return 3;
00176       } else if ( data < pow2_28 ) {
00177         return 4;
00178       } else if ( data < UINT64(1)<<35 ) {
00179         return 5;
00180       } else if ( data < UINT64(1)<<42 ) {
00181         return 6;
00182       } else if ( data < UINT64(1)<<49 ) {
00183         return 7;
00184       } else if ( data < UINT64(1)<<56 ) {
00185         return 8;
00186       } else if ( data < UINT64(1)<<63 ) {
00187         return 9;
00188       } else {
00189         return 10;
00190       }
00191     }
00192 
00193     inline char* RVLCompress::compress_int( char* dest, int data ) {
00194       if( data < (1<<7) ) {
00195         RVL_COMPRESS_TERMINATE( dest, data, 0 );
00196         return dest + 1;
00197       } else if( data < (1<<14) ) {
00198         RVL_COMPRESS_BYTE( dest, data, 0 );
00199         RVL_COMPRESS_TERMINATE( dest, data, 1 );
00200         return dest + 2;
00201       } else {
00202         return _compress_bigger_int( dest, data );
00203       }
00204     }
00205 
00206     inline char* RVLCompress::compress_longlong( char* source, INT64 data ) {
00207       UINT64 number;
00208       number = foldNegatives( data );
00209       return compress_longlong( source, number );
00210     }
00211 
00212     inline char* RVLCompress::compress_longlong( char* dest, UINT64 data ) {
00213       if( data < (UINT64(1)<<7) ) {
00214         RVL_COMPRESS_BYTE( dest, data, 0 );
00215         RVL_COMPRESS_TERMINATE( dest, data, 0 );
00216         return dest + 1;
00217       } else if( data < (UINT64(1)<<14) ) {
00218         RVL_COMPRESS_BYTE( dest, data, 0 );
00219         RVL_COMPRESS_TERMINATE( dest, data, 1 );
00220         return dest + 2;
00221       } else {
00222         return _compress_bigger_longlong( dest, data );
00223       }
00224     }
00225 
00226     inline const char* RVLCompress::decompress_longlong( const char* source, INT64& data ) {
00227       UINT64 number;
00228       source = decompress_longlong( source, number );
00229       data = unfoldNegatives( number );
00230       return source;
00231     }
00232   }
00233 }
00234 
00235 #endif

Generated on Tue Jun 15 11:02:55 2010 for Lemur by doxygen 1.3.4