Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

FreqCounter.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2001 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 #ifndef _FREQCOUNTER_HPP
00013 #define _FREQCOUNTER_HPP
00014 
00015 #include <map>
00016 #include <set>
00017 #include "InvFPTypes.hpp"
00018 #include "TextHandler.hpp"
00019 #include "Stopper.hpp"
00020 namespace lemur 
00021 {
00022   namespace distrib 
00023   {
00024     
00026 #define R_CTF 0  // Integer selector; by name pairs with FreqCounter::randomCtf() -- presumably a value for setRandomMode(), TODO confirm in FreqCounter.cpp.
00027 // NOTE(review): the R_* names are preprocessor macros, so although they are written inside namespace lemur::distrib they are NOT scoped by it.
00028 #define R_DF 1  // Integer selector; by name pairs with FreqCounter::randomDf() -- TODO confirm.
00029 
00030 #define R_AVE_TF 2  // Integer selector; by name pairs with FreqCounter::randomAveTf() -- TODO confirm.
00031 
00032 #define R_UNIFORM 3  // Integer selector; by name pairs with FreqCounter::randomUniform() -- TODO confirm.
00033 
00035     typedef struct freqinfo_tt {  // Per-word record; used as the mapped value type of the freqmap typedef.
00036       char * word;  // The word as a C string. Ownership/lifetime of this buffer is not visible in this header -- TODO confirm.
00037       int ctf;  // Presumably the collection term frequency of the word (cf. FreqCounter::getCtf) -- TODO confirm.
00038       int df;  // Presumably the document frequency of the word (cf. FreqCounter::getDf) -- TODO confirm.
00039     } freqinfo_t;
00040 
00042     typedef map<std::string , freqinfo_t, less<std::string> > freqmap;  // Ordered map from a word (std::string) to its freqinfo_t record. NOTE(review): map/less are unqualified; no using-declaration is visible in this file, so they presumably rely on one leaked by an included header -- verify.
00044     typedef set<std::string , less<std::string> > stringset;  // Ordered set of words (std::string). NOTE(review): set/less are unqualified here as well.
00045 
00046 
00052     class FreqCounter : public lemur::api::TextHandler {  // Word-frequency counter implemented as a TextHandler subclass. Method bodies live outside this header, so behaviour notes below are hedged where the declarations alone do not establish them.
00053 
00054     public:
00057       FreqCounter(const lemur::api::Stopper * stopWords = NULL);  // stopWords is optional (defaults to NULL); presumably stored in the `stopper` member -- TODO confirm.
00060       FreqCounter(const string &filename, const lemur::api::Stopper * stopWords = NULL);  // Additionally takes a file name; presumably loads existing counts via input() -- TODO confirm.
00061   
00063       ~FreqCounter();  // Not declared virtual here; whether it is virtual depends on TextHandler's destructor, which is not visible in this file.
00064 
00066       void clear();  // Presumably resets the accumulated counts -- TODO confirm.
00067 
00069       void output(const string &filename) const;  // Takes a file name and does not modify logical state (const); the output format is not visible here.
00070 
00073       char * randomWord();  // Non-const, unlike the random*() variants below; presumably dispatches on randomMode -- TODO confirm. Ownership of the returned pointer is not visible here.
00080       void setRandomMode(int mode);  // mode is presumably one of R_CTF / R_DF / R_AVE_TF / R_UNIFORM -- TODO confirm.
00082       int getRandomMode() const;  // Presumably returns the randomMode member -- TODO confirm.
00083 
00086       char * randomCtf() const;  // By name, a random word chosen using ctf -- selection details are not visible here.
00089       char * randomDf() const;  // By name, a random word chosen using df -- selection details are not visible here.
00092       char * randomAveTf() const;  // By name, a random word chosen using average tf -- selection details are not visible here.
00095       char * randomUniform() const;  // By name, a uniformly chosen random word -- selection details are not visible here.
00096 
00097 
00099       int numWords() const;  // Presumably the nWords member (number of unique words) -- TODO confirm.
00101       int totWords() const;  // Presumably the total word count. NOTE(review): returns int while ctfTot is long -- check for truncation if it returns ctfTot.
00102 
00104       const freqmap * getFreqInfo() const;  // Read-only pointer to a freqmap; presumably the address of the freqInfo member -- TODO confirm.
00105 
00107       int getCtf(const char * word) const;  // Per-word lookup; the result for a word that was never seen is not visible here.
00109       int getDf(const char * word) const;  // Per-word lookup; the result for a word that was never seen is not visible here.
00111       double getAveTf(const char * word) const;  // Per-word lookup returning a double; the exact formula is not visible here.
00112 
00114       double ctfRatio(FreqCounter & lm1) const;  // Compares against another counter, taken by non-const reference; the formula is not visible here.
00115 
00117       char * handleDoc(char * docno);  // Presumably the TextHandler document hook -- confirm against TextHandler.hpp.
00119       char * handleWord(char * word);  // Presumably the TextHandler word hook, where counting happens -- confirm against TextHandler.hpp.
00120 
00122       void endDoc();  // Presumably called at the end of each document to fold the per-document word set `doc` into the counts -- TODO confirm.
00123 
00125       void setName(const string &freqCounterName);  // Presumably assigns the `name` member -- TODO confirm.
00127       const string & getName() const;  // Returns a reference to a string, presumably the `name` member -- TODO confirm.
00128 
00130       void pruneBottomWords(int topWords);  // By name, presumably keeps the topWords highest-ranked words and drops the rest; the ranking criterion is not visible here.
00131   
00132 
00133     protected:
00134       /* Loads a language model from file. */
00135       void input(const string &filename);
00136 
00137       /* Collection term frequencies. */
00138       mutable freqmap freqInfo;  // `mutable`: may be modified from const member functions.
00139 
00140       /* Words in a doc. */
00141       stringset doc;
00142       /* Random words returned so far. */
00143       stringset randdone;
00144 
00145       /* The frequency counter's name. */
00146       string name;  // NOTE(review): `string` is unqualified; it presumably relies on a using-declaration from an included header -- verify.
00147 
00148       /* Stopword list */
00149       const lemur::api::Stopper * stopper;  // Pointer to a const Stopper; whether this class owns it is not visible here.
00150 
00151 
00152       /* used for calculating probabilities when
00153        * selecting a random word
00154        */
00155       /* Sum over words of ctf. */
00156       long ctfTot;
00157       /* Sum over words of df. */
00158       int dfTot;  
00159       /* Sum over words of average tf. */
00160       mutable long double avetfTot;  // `mutable` so it can be recomputed inside const member functions (see atfValid).
00161       /* Indicates whether avetfTot is valid (true)
00162        * or needs to be recalculated (false). */
00163       mutable bool atfValid;
00164       /* Random selection mode. */
00165       int randomMode;
00166       /* Number of unique words. */
00167       int nWords;
00168 
00169 
00170     };
00171  
00172   }
00173 }
00174 
00175 
00176 #endif

Generated on Tue Jun 15 11:02:54 2010 for Lemur by doxygen 1.3.4