Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

PDict.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 /* 
00014  * dmf 02/2004
00015  */
00016 #ifndef _LEMUR_PDICT_HPP
00017 #define _LEMUR_PDICT_HPP
00018 
00019 #include "common_headers.hpp"
00020 #include "algorithm"
00021 #include "Keyfile.hpp"
00022 #include "File.hpp"
00023 #include "TextHandlerManager.hpp"
00024 namespace lemur 
00025 {
00027   namespace dictionary 
00028   {
00029     
00032     class DictEntry {
            // One dictionary translation: a target string, a type tag and a
            // probability value. Everything except the three-argument
            // constructor and operator== is only declared here; the definitions
            // are not part of this listing.
00033     public:
            /// Default constructor, defined out of line. The initial field
            /// values are not visible from this header.
00035       DictEntry();
00036 
            /// Build an entry from explicit field values.
            /// @param targ copied into #target
            /// @param typ  copied into #type
            /// @param pr   stored in #prob
00038       DictEntry(const string &targ, const string &typ, double pr) : target(targ), type(typ), 
00039                                                                     prob(pr) { }
00040   
            /// NOTE(review): presumably serializes this entry into a freshly
            /// allocated buffer and reports its length through numBytes. Who
            /// frees the returned buffer is not shown here -- confirm.
00044       char *toBytes(int &numBytes) const;
00045 
            /// NOTE(review): presumably serializes this entry into the
            /// caller-supplied buffer and returns the number of bytes written;
            /// no buffer size is passed, so the caller is assumed to have
            /// reserved at least numBytes() bytes -- confirm.
00049       int toBytes(char *buffer) const;
00050 
            /// NOTE(review): presumably the serialized size of this entry in
            /// bytes -- confirm against the definition.
00053       int numBytes() const;
            /// NOTE(review): presumably the inverse of toBytes(char*): fills the
            /// fields from buffer and returns the number of bytes consumed --
            /// confirm.
00057       int fromBytes(char *buffer);
            /// Equality on target and type only; prob does not take part in the
            /// comparison, so two entries differing only in probability compare
            /// equal.
00059       bool operator==(const DictEntry & a) const { 
00060         return (target == a.target && type == a.type); 
00061       }
            /// NOTE(review): presumably renders the fields as text separated by
            /// delim (";" when omitted); field order is not visible here.
00062       string toString(string delim = ";") const ;
00063     
            /// Target side of the translation.
00065       string target;
            /// Type tag of the entry (compared by TypeDictEntryFilter).
00067       string type;
            /// Probability value (compared by ProbDictEntryFilter and used as
            /// the sort key in DictEntryVector).
00069       double prob;
00070     };
00071 
00075     class DictEntryFilter {
            // Abstract predicate over DictEntry objects. Concrete filters
            // override accept(); instances are passed around by pointer
            // (see DictEntryVector and PDict).
00076     public:
            /// Decide whether an entry passes the filter.
            /// @param entry the entry to test
            /// @return true if the entry is accepted, false otherwise
00079       virtual bool accept(const DictEntry &entry) const = 0;
            /// Virtual so that a filter can be destroyed through a base pointer.
00080       virtual ~DictEntryFilter() {}
00081     } ;
00082 
00083 
00087     class AllDictEntryFilter : public DictEntryFilter {
            // Filter that lets every entry through.
00088     public:
            /// Accept unconditionally; the entry argument is not inspected.
            /// @return always true
00091       bool accept(const DictEntry &entry) const { return true; }
00092     };
00093 
00094 
00098     class ProbDictEntryFilter : public DictEntryFilter {
            // Filter that keeps entries whose prob exceeds a fixed threshold.
00099     public:
            /// @param thresh threshold to compare against; 0.0 when omitted
00102       ProbDictEntryFilter(double thresh = 0.0) : threshold(thresh) {
00103       }
            /// Accept entries with prob strictly greater than the threshold;
            /// an entry whose prob equals the threshold is rejected.
00107       bool accept(const DictEntry &entry) const { return entry.prob > threshold; }
00108     private:
            /// Threshold fixed at construction.
00109       double threshold;
00110     };
00111 
00112 
00116     class TypeDictEntryFilter : public DictEntryFilter {
            // Filter that keeps entries whose type tag equals a given string.
00117     public:
            /// @param filtType the type tag to match; copied into the filter
00120       TypeDictEntryFilter(const string &filtType) : type(filtType) {
00121       }
            /// Accept entries whose type compares equal (exact string equality)
            /// to the tag given at construction.
00125       bool accept(const DictEntry &entry) const { return entry.type == type; }
00126     private:
            /// Type tag to match.
00127       string type;
00128     };
00129 
00134     class StopwordDictEntryFilter : public DictEntryFilter {
            // Filter that rejects entries whose target is reported as a stopword
            // by a lemur::api::Stopper built from the given stopword source.
            //
            // NOTE(review): this class treats the Stopper returned by
            // TextHandlerManager::createStopper as heap-allocated and owned by
            // the caller, and allows for a NULL result -- confirm against the
            // TextHandlerManager implementation.
00135     public:
            /// Obtain a stopper for the given stopword source.
            /// @param stopwords passed unchanged to createStopper
00138       StopwordDictEntryFilter(const string &stopwords) {
00139         stopper = lemur::api::TextHandlerManager::createStopper(stopwords);
00140       }
            /// Release the stopper obtained in the constructor (previously it
            /// was never freed). Deleting a NULL pointer is a no-op.
            ~StopwordDictEntryFilter() {
              delete stopper;
            }
            /// Accept entries whose target is not a stopword.
            /// @return false when the stopper reports entry.target as a
            ///         stopword, true otherwise (including when no stopper
            ///         could be created)
00144       bool accept(const DictEntry &entry) const { 
              // Without a stopper there is no stopword list to consult, so
              // nothing is filtered out instead of dereferencing NULL.
              if (stopper == NULL) return true;
00145         return !(stopper->stopWord(entry.target.c_str())); 
00146       }
00147     private:
            // Not copyable: a copy would share the raw pointer and the
            // destructor would then delete it twice. Declared, never defined.
            StopwordDictEntryFilter(const StopwordDictEntryFilter &);
            StopwordDictEntryFilter &operator=(const StopwordDictEntryFilter &);
            /// Stopper consulted by accept(); may be NULL.
00148       lemur::api::Stopper *stopper;
00149     };
00150 
00152     class DictEntryVector : public vector<DictEntry> {
            // A vector of DictEntry with helpers for sorting by probability and
            // for (de)serialization. Most members are only declared here.
            //
            // NOTE(review): publicly derives from vector<DictEntry>, which has
            // no virtual destructor; do not delete a DictEntryVector through a
            // vector<DictEntry> pointer.
00153     public:
            /// Create an empty vector.
00154       DictEntryVector() : vector<DictEntry>() {
00155       }
            /// NOTE(review): presumably builds the vector from a serialized
            /// buffer, keeping the entries the filter accepts (see fromBytes);
            /// handling of a NULL filter is not visible here -- confirm.
00156       DictEntryVector(char *buffer, DictEntryFilter *filter);
            /// Sort the entries in place by descending prob (see cmpFn).
00158       void sortScores() {
00159         sort(this->begin(), this->end(), cmpFn);
00160       }
            /// NOTE(review): presumably adds entry, using compose (when not
            /// NULL) to combine probabilities if an equal entry already exists;
            /// the meaning of the bool result is not visible here -- confirm.
00165       bool addEntry(DictEntry &entry, double (*compose)(double, double) = NULL);
00166 
            /// NOTE(review): presumably removes the entry equal to the argument
            /// and reports whether one was found -- confirm.
00170       bool removeEntry(DictEntry &entry);
00171 
            /// NOTE(review): presumably serializes all entries into a newly
            /// allocated buffer and reports its size through numBytes; buffer
            /// ownership is not shown here -- confirm.
00175       char *toBytes(int &numBytes) const;
00179 
            /// NOTE(review): presumably serializes all entries into the
            /// caller-supplied buffer; no size is passed, so the caller must
            /// know the required size -- confirm.
00183       void toBytes(char *buffer) const;
00184 
            /// NOTE(review): presumably fills the vector from a serialized
            /// buffer, keeping the entries the filter accepts -- confirm.
00185       void fromBytes(char *buffer, DictEntryFilter *filter);
00186 
            /// NOTE(review): presumably the number of entries held; how this
            /// relates to size() is not visible here -- confirm.
00189       int numEntries() const;
00190 
            /// NOTE(review): presumably rescales the prob values of the entries
            /// so they sum to one -- confirm against the definition.
00192       void normalize();
00193   
00194     private:
            /// Strict ordering used by sortScores(): a comes before b when a has
            /// the larger prob.
00195       class DictEntryProbDescending { 
00196       public: 
              // const-qualified: the comparison reads its arguments only and
              // the functor has no state, so it is callable on const objects.
00197         bool operator()(const DictEntry & a, const DictEntry & b) const {
00198           return a.prob > b.prob;
00199         }
00200       };
            /// Shared comparator instance; defined out of line.
00201       static DictEntryProbDescending cmpFn;
00202     };
00203 
00205     struct dictStats {
            // Plain counters describing a dictionary; held by PDict as `stats`.
            // NOTE(review): the per-field meanings below are inferred from the
            // names and from PDict's getNumPairs/getSourceCount/getTargetCount
            // accessors -- confirm against PDict's implementation.
            /// Presumably the number of source/target pairs.
00207       int dictSize;
            /// Presumably the number of distinct source terms.
00209       int sourceSize;
            /// Presumably the number of distinct target terms.
00211       int targetSize;
00212     };
00213 
00217     class PDict {
            // Dictionary mapping a source term to a DictEntryVector of
            // translations, stored in two Keyfiles and a File (see the private
            // members). Apart from the small inline accessors, every member is
            // defined out of line, so behavioural notes below are hedged.
            //
            // NOTE(review): the class declares a destructor and holds the raw
            // pointer currentVec but declares no copy constructor or assignment
            // operator; whether copying a PDict is possible or safe depends on
            // the member types and the unseen definitions -- confirm.
00218     public:
            /// Default constructor (defined out of line).
00220       PDict();
00221   
            /// Destructor (defined out of line).
00223       ~PDict();
00224 
            /// NOTE(review): presumably returns the translations stored for
            /// term, restricted to entries the filter accepts. Ownership of the
            /// returned vector, the result for an unknown term and the meaning
            /// of a NULL filter are not visible here -- confirm.
00230       DictEntryVector *getTranslations(const string &term, 
00231                                        DictEntryFilter *filter=NULL) const ;
            /// NOTE(review): presumably the number of translations of term that
            /// pass the filter -- confirm.
00236       int numTranslations(const string &term, 
00237                           DictEntryFilter *filter=NULL) const;
            /// NOTE(review): presumably the number of source/target pairs
            /// (cf. dictStats::dictSize) -- confirm.
00240       int getNumPairs() const;
00241 
            /// NOTE(review): presumably the number of source terms
            /// (cf. dictStats::sourceSize) -- confirm.
00244       int getSourceCount() const;
00245 
            /// NOTE(review): presumably the number of target terms
            /// (cf. dictStats::targetSize) -- confirm.
00248       int getTargetCount() const ;
00249 
            /// @return a reference to the stored dictionary name
00252       const string &getName() const {return name;}
00253 
            /// @return the current value of the usingCounts flag
00256       bool isUsingCounts() const {return usingCounts;}
00257   
            /// Set the usingCounts flag.
            /// @param val new value of the flag
00260       void setUsingCounts(bool val) {usingCounts = val;}
00261   
            /// NOTE(review): presumably adds value as a translation of source,
            /// using compose (when not NULL) to combine probabilities with an
            /// already present equal entry -- confirm.
00267       void add(const string &source, DictEntry &value, 
00268                double (*compose)(double, double) = NULL);
00269 
            /// NOTE(review): presumably removes the single translation value
            /// from the entries of source -- confirm.
00273       void remove(const string &source, DictEntry &value);
00274 
            /// NOTE(review): presumably removes source together with all of its
            /// translations -- confirm.
00277       void remove(const string &source);
00278 
            /// NOTE(review): presumably writes the dictionary as delimited text
            /// to outputName, using delim between fields -- confirm.
00283       void write(const string &outputName, const string &delim);
00284 
            /// NOTE(review): presumably loads delimited text from dictName into
            /// this dictionary; counts (false when omitted) presumably marks the
            /// values as counts rather than probabilities. The meaning of the
            /// bool result is not visible here -- confirm.
00294       bool read(const string &dictName, const string &delim, bool counts = false);
00295 
            /// NOTE(review): presumably opens an existing dictionary named
            /// dictName and reports success -- confirm.
00300       bool open(const string &dictName);
00301 
            /// NOTE(review): presumably creates a new, empty dictionary named
            /// dictName and reports success -- confirm.
00306       bool create(const string &dictName);
00307 
            /// NOTE(review): presumably flushes pending data and closes the
            /// underlying files -- confirm.
00310       void close();
00311   
            /// NOTE(review): presumably normalizes the stored values of every
            /// source term (cf. DictEntryVector::normalize) -- confirm.
00314       void normalize();
00315 
            /// Begin an iteration by positioning the `dict` keyfile at its
            /// first key (calls dict.setFirst()).
00317       void startIteration() {dict.setFirst();}
00318 
            /// NOTE(review): presumably returns the translations of the next
            /// source term in iteration order and stores that term in `term`;
            /// the end-of-iteration result and ownership of the returned vector
            /// are not visible here -- confirm.
00323       DictEntryVector *nextTranslations(string &term, 
00324                                         DictEntryFilter *filter=NULL) const;
00325 
00326     private:
            /// NOTE(review): presumably persists the dictionary statistics
            /// ("table of contents") -- confirm.
00328       void writeTOC() const;  
            /// NOTE(review): presumably tests whether term is a key of the
            /// given keyfile -- confirm.
00330       bool contains(const string &term, lemur::file::Keyfile &keyfile) const;
            /// NOTE(review): presumably writes the cached currentVec for
            /// currentTerm back to storage -- confirm.
00332       void flush();
            /// Size counters for the dictionary.
00334       dictStats stats;
            /// Raw pointer to a DictEntryVector; presumably the entries cached
            /// for currentTerm. Ownership is not visible here.
00336       DictEntryVector* currentVec;
            /// Flag exposed through isUsingCounts()/setUsingCounts().
00338       bool usingCounts;
            /// Presumably the source term whose entries currentVec holds.
00340       string currentTerm;
            /// Dictionary name returned by getName().
00342       string name;
            /// Keyfile positioned by startIteration(); mutable so that const
            /// members can use it.
00344       mutable lemur::file::Keyfile dict;
            /// Second keyfile; by its name presumably keyed by target term.
00346       mutable lemur::file::Keyfile targetIDs;
            /// File member; by its name presumably holds the serialized entry
            /// vectors.
00348       mutable lemur::file::File dictEntries;
00349     };
00350   }
00351 }
00352 
00353 #endif // _LEMUR_PDICT_HPP

Generated on Tue Jun 15 11:02:55 2010 for Lemur by doxygen 1.3.4