Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

UnigramLM.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2001 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 #ifndef _UNIGRAMLM_HPP
00014 #define _UNIGRAMLM_HPP
00015 
00016 #include "Counter.hpp"
00017 #include "Exception.hpp"
00018 #include "IndexTypes.hpp"
00019 #include <cstring>
00020 namespace lemur 
00021 {
00022   namespace langmod
00023   {
00024     
00026 
00031     class UnigramLM {
00032     public:
00034       virtual double prob(lemur::api::TERMID_T wordIndex) const = 0;
00036       virtual const string lexiconID() const= 0;
00037 
00039       virtual void startIteration() const = 0;
00040       virtual bool hasMore() const = 0;
00041       virtual void nextWordProb(lemur::api::TERMID_T &wordIndex, double &prob) const = 0;
00042     };
00043 
00044 
00046 
00047     class SmoothedMLEstimator : public UnigramLM {
00048     public:
00049       SmoothedMLEstimator(const lemur::utility::Counter &counter, const string &lexiconID) : ct(counter), lexID(lexiconID) {}
00050       virtual ~SmoothedMLEstimator() {}
00051 
00052       virtual double prob(lemur::api::TERMID_T wordIndex) const {
00053         return (probEstimate(wordIndex, ct.count(wordIndex),ct.sum()));
00054       }
00055 
00056       virtual void startIteration() const {
00057         ct.startIteration();
00058       }
00059 
00060       virtual bool hasMore() const {
00061         return ct.hasMore();
00062       }
00063 
00064       virtual void nextWordProb(lemur::api::TERMID_T &wordIndex, double &prob) const{
00065         double count;
00066         //dmf FIXME
00067         ct.nextCount((int&)wordIndex, count);
00068         prob = probEstimate(wordIndex, count, ct.sum());
00069       }
00070   
00071       virtual const string lexiconID() const { return lexID;}
00072 
00074       virtual double probEstimate(lemur::api::TERMID_T wordIndex, double wdCount, double sumCount) const=0;
00075 
00076     protected:
00077       const lemur::utility::Counter &ct;
00078       const string lexID;
00079     };
00080   
00082 
00083     class MLUnigramLM : public SmoothedMLEstimator { 
00084     public:
00085       MLUnigramLM(const lemur::utility::Counter & counter, const string &lexiconID) : SmoothedMLEstimator(counter, lexiconID) {};
00086       virtual ~MLUnigramLM() {}
00087   
00088       virtual double probEstimate(lemur::api::TERMID_T wordIndex, double count, double sum) const{
00089         return (count/sum);
00090       }
00091     };
00092 
00094     class LaplaceUnigramLM : public SmoothedMLEstimator { 
00095     public:
00096       LaplaceUnigramLM(const lemur::utility::Counter & counter, const string &lexiconID, double vocabSize) : SmoothedMLEstimator(counter, lexiconID), vocSz(vocabSize) {};
00097       virtual ~LaplaceUnigramLM() {}
00098   
00099       virtual double probEstimate(lemur::api::TERMID_T wordIndex, double count, double sum) const {
00100         return ((count+1)/(sum+vocSz));
00101       }
00102     private:
00103       double vocSz;
00104     };
00105 
00106 
00108 
00109     class DirichletUnigramLM : public SmoothedMLEstimator { 
00110     public:
00111       DirichletUnigramLM(const lemur::utility::Counter & counter, const string &lexiconID, 
00112                          const UnigramLM &refLM, double priorSampleSize) 
00113         : SmoothedMLEstimator(counter, lexiconID), ref(&refLM), 
00114           s(priorSampleSize) {}
00115 
00116       virtual ~DirichletUnigramLM() {}
00117   
00118       virtual double probEstimate(lemur::api::TERMID_T wordIndex, double count, double sum) const {
00119         return ((count+s*ref->prob(wordIndex))/(sum+s));
00120       }
00121 
00122     private:
00123       const UnigramLM *ref;
00125       double s;  
00126     };
00127 
00128 
00129 
00130 
00131 
00132 
00134 
00135     class InterpUnigramLM : public SmoothedMLEstimator { 
00136     public:
00137       InterpUnigramLM(const lemur::utility::Counter & counter, const string &lexiconID, 
00138                       const UnigramLM &refLM, double refCoeff) 
00139         : SmoothedMLEstimator(counter, lexiconID), ref(&refLM), 
00140           refC(refCoeff) {}
00141 
00142       virtual ~InterpUnigramLM() {}
00143   
00144       virtual double probEstimate(lemur::api::TERMID_T wordIndex, double count, double sum) const {
00145         return ((1-refC)*count/sum + refC*ref->prob(wordIndex));
00146       }
00147 
00148     private:
00149       const UnigramLM *ref;
00151       double refC;  
00152     };
00153  
00154   }
00155 }
00156 #endif /* _UNIGRAMLM_HPP */

Generated on Tue Jun 15 11:02:56 2010 for Lemur by doxygen 1.3.4