Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

TFIDFTermScoreFunction.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // TFIDFTermScoreFunction
00015 //
00016 // 23 January 2004 -- tds
00017 //
00018 
00019 #ifndef INDRI_TFIDFTERMSCOREFUNCTION_HPP
00020 #define INDRI_TFIDFTERMSCOREFUNCTION_HPP
00021 #include <iostream>
00022 #include "indri/TermScoreFunction.hpp"
00023 #include <math.h>
00024 namespace indri
00025 {
00026   namespace query
00027   {
00028     
00029     class TFIDFTermScoreFunction : public TermScoreFunction {
00030     private:
00032       double _inverseDocumentFrequency; 
00034       double _averageDocumentLength;
00035 
00036       double _termWeight;
00037 
00038       // These are BM25 parameters
00039       double _k1;
00040       double _b;
00041       // okapi query term weight parameter
00042       double _k3;
00043       
00044       // The following values are precomputed so that score computation will go faster
00045       double _bOverAvgDocLength;
00046       double _k1TimesOneMinusB;
00047       double _idfTimesK1PlusOne;
00048       double _k1TimesBOverAvgDocLength;
00049       double _termWeightTimesIDFTimesK1;
00050       double _termWeightTimesidfTimesK1PlusOne;
00051       bool _okapi;
00052       
00053       void _precomputeConstants() {
00054         _idfTimesK1PlusOne = _inverseDocumentFrequency * ( _k1 + 1 );
00055         _k1TimesOneMinusB = _k1 * (1-_b);
00056         _bOverAvgDocLength = _b / _averageDocumentLength;
00057         _k1TimesBOverAvgDocLength = _k1 * _bOverAvgDocLength;
00058         _termWeightTimesIDFTimesK1 = _termWeight * _inverseDocumentFrequency * _k1; 
00059         _termWeightTimesidfTimesK1PlusOne = _termWeight * _idfTimesK1PlusOne;
00060       }
00061 
00062     public:
00063       TFIDFTermScoreFunction( double idf, double averageDocumentLength, int qTF = 1, double k1 = 1.2, double b = 0.75, bool okapi = false, double k3 = 7 ) {
00064         _okapi = okapi;
00065         _inverseDocumentFrequency = idf;
00066         _averageDocumentLength = averageDocumentLength;
00067 
00068         _k1 = k1;
00069         _b = b;
00070         _k3 = k3;
00071         // needs to be adjusted to _termWeight/_qTF to enable additive
00072         // scoring of terms when _qTF > 1 to get the values correct.
00073         _termWeight = queryTermWeight( 1000, 0, qTF ) / qTF;
00074         _precomputeConstants();
00075       }
00076 
00077       TFIDFTermScoreFunction( double idf, double averageDocumentLength, double qtw = 1.0, double k1 = 1.2, double b = 0.75, bool okapi = false, double k3 = 7 ) {
00078         _okapi = okapi;
00079         _inverseDocumentFrequency = idf;
00080         _averageDocumentLength = averageDocumentLength;
00081 
00082         _k1 = k1;
00083         _b = b;
00084         _k3 = k3;
00085         
00086         // if the weight is supplied, don't recompute it
00087         _termWeight = qtw;
00088         //        _termWeight = queryTermWeight( 1000, 0, qtw );
00089         _precomputeConstants();
00090       }
00091 
00092       double scoreOccurrence( double occurrences, int documentLength ) {
00093         if (_okapi) {
00094             
00095           // okapi
00096           //
00097           // Score function is:
00098           //                                                   (K1 + 1) * occurrences
00099           // score = termWeight * IDF * ------------------------------------------------------------------
00100           //                             occurrences + K1 * ( (1-B) + B * ( documentLength / avgDocLength) )
00101           //
00102           // Factored for constants:
00103           //                        (termWeight * IDF * (K1 + 1)) * occurrences
00104           // score = ------------------------------------------------------------------------
00105           //          occurrences + (K1 * (1-B)) + (K1 * B * 1/avgDocLength) * documentLength
00106           //
00107           double numerator = _termWeightTimesidfTimesK1PlusOne * occurrences;
00108           double denominator = occurrences + _k1TimesOneMinusB + _k1TimesBOverAvgDocLength * documentLength;
00109           return numerator / denominator; 
00110         } else {
00111           //simple tfidf
00112           //
00113           // Score function is:
00114           //                                                   K1 * occurrences
00115           // score = termWeight * IDF * ------------------------------------------------------------------
00116           //                             occurrences + K1 * ( (1-B) + B * ( documentLength / avgDocLength) )
00117           //
00118           // Factored for constants:
00119           //                        (termWeight * IDF * K1) * occurrences
00120           // score = ------------------------------------------------------------------------
00121           //          occurrences + (K1 * (1-B)) + (K1 * B * 1/avgDocLength) * documentLength
00122           //
00123             
00124 
00125           double numerator = _termWeightTimesIDFTimesK1 * occurrences;
00126           double denominator = occurrences + _k1TimesOneMinusB + _k1TimesBOverAvgDocLength * documentLength;
00127           return numerator / denominator;
00128         }
00129         
00130       }
00131       
00132       double scoreOccurrence( double occurrences, int contextSize, double documentOccurrences, int documentLength ) {
00133         return scoreOccurrence(occurrences, contextSize);
00134       }
00135       
00136       double maximumScore( int minimumDocumentLength, int maximumOccurrences ){
00137         return scoreOccurrence( maximumOccurrences, minimumDocumentLength );
00138       }
00139 
00140       double queryTermWeight( double queryK1, double queryB, double _qTF ) {
00141         if (_okapi)
00142           return (((_k3 + 1) * _qTF)/(_k3 + _qTF));
00143         else
00144           // lemur tfidf:
00145           //                  _qTF    queryK1  b |D| |D|_avg
00146           // idf[q] * BM25TF(rawTF,prm.bm25K1, 0, 1,  1)
00147           return ( _inverseDocumentFrequency * queryK1 * _qTF ) / ( _qTF + queryK1 );
00148 
00149           //          return ( _inverseDocumentFrequency * queryK1 ) / ( 1 + queryK1 * ( (1-queryB) + queryB * (1/_averageDocumentLength) ) );
00150       }
00151     };
00152   }
00153 }
00154 
00155 #endif // INDRI_TFIDFTERMSCOREFUNCTION_HPP
00156 

Generated on Tue Jun 15 11:02:56 2010 for Lemur by doxygen 1.3.4