Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

DirichletTermScoreFunction.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // DirichletTermScoreFunction
00015 //
00016 // 26 January 2004 - tds
00017 //
00018 
00019 #ifndef INDRI_DIRICHLETTERMSCOREFUNCTION_HPP
00020 #define INDRI_DIRICHLETTERMSCOREFUNCTION_HPP
00021 
00022 #include <math.h>
00023 namespace indri
00024 {
00026   namespace query
00027   {
00028     
00029     class DirichletTermScoreFunction : public TermScoreFunction {
00030     private:
00031       double _mu;
00032       double _docmu;
00033       double _collectionFrequency;
00034       double _muTimesCollectionFrequency;
00035 
00036     public:
00037       DirichletTermScoreFunction( double mu, double collectionFrequency, double docmu=-1.0 ) {
00038         _collectionFrequency = collectionFrequency;
00039         _mu = mu;
00040         _muTimesCollectionFrequency = _mu * _collectionFrequency;
00041         _docmu = docmu;
00042       }
00043 
00044       double scoreOccurrence( double occurrences, int contextSize ) {
00045         double seen = ( double(occurrences) + _muTimesCollectionFrequency ) / ( double(contextSize) + _mu );
00046         return log( seen );
00047       }
00048 
00049       double scoreOccurrence( double occurrences, int contextSize, double documentOccurrences, int documentLength ) {
00050 //two level Dir Smoothing!
00051 //        tf_E + documentMu*P(t|D)
00052 //P(t|E)= ------------------------
00053 //         extentlen + documentMu
00054 //                 mu*P(t|C) + tf_D
00055 //where P(t|D)= ---------------------
00056 //                  doclen + mu
00057         // if the _docmu parameter is the default, do collection level
00058         // smoothing only.
00059         if (_docmu < 0)
00060           return scoreOccurrence(occurrences, contextSize);
00061         else {
00062           double seen = (occurrences+_docmu*(_muTimesCollectionFrequency+documentOccurrences)/(double(documentLength)+_mu))/(double(contextSize)+_docmu);
00063           return log(seen);
00064         }
00065       }
00066     };
00067   }
00068 }
00069 
00070 #endif // INDRI_DIRICHLETTERMSCOREFUNCTION_HPP

Generated on Tue Jun 15 11:02:53 2010 for Lemur by doxygen 1.3.4