Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

MMRSumm.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2002 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 #ifndef _MMRSUMM_HPP
00013 #define _MMRSUMM_HPP
00014 
00015 #include <iomanip>
00016 #include "Summarizer.hpp"
00017 #include "Passage.hpp"
00018 #include "MMRPassage.hpp"
00019 #include "Index.hpp"
00020 #include <algorithm>
00021 #include <vector>
00022 #include <string>
00023 using std::string;
00024 using std::vector;
00025 
00026 namespace lemur 
00027 {
00028   namespace summarization 
00029   {
00035     class MMRSumm : public Summarizer {
00036 
00037     private:
00038       double lambda;
00039       const lemur::api::Index* idx;
00040       int summLen;
00041       vector<MMRPassage> doc;
00042       mutable int iterCount;
00043       double maxSims;
00044       MMRPassage* queryPassage;
00045 
00046       int autoMMRQuery(void) {
00047         lemur::api::TermInfo* tEntry;
00048         lemur::api::TermInfoList* tList = idx->termInfoListSeq(idx->document(queryPassage->docID));
00049         // allocating a new object each time leaks
00050         // the vector makes a copy of the object.
00051         // dmf 05/2005
00052         // termCount* storage;
00053         termCount storage;
00054         if (hasTITLE(idx, tList)) {
00055           // use title words
00056           tList->startIteration();
00057           cout << "title found" << endl;
00058           while (tList->hasMore()) {
00059             tEntry = tList->nextEntry();
00060             if ( isTITLE(idx->term(tEntry->termID())) ) {
00061               tEntry = tList->nextEntry(); // the actual word after title token
00062               //              storage = new termCount;
00063               //              storage->termID = tEntry->termID();
00064               //              storage->tf = tEntry->count();
00065               //              storage->val = tEntry->count();
00066               //              queryPassage->addTerm(*storage);
00067               storage.termID = tEntry->termID();
00068               storage.tf = tEntry->count();
00069               storage.val = tEntry->count();
00070               queryPassage->addTerm(storage);
00071             }
00072           }      
00073         } else {
00074           tList->startIteration();
00075           for (int i=0; i<10; i++) {
00076             if (tList->hasMore()) {
00077               tEntry = tList->nextEntry();
00078               //              storage = new termCount;
00079               //              storage->termID = tEntry->termID();
00080               //              storage->tf = tEntry->count();
00081               //              storage->val = tEntry->count();
00082               //              queryPassage->addTerm(*storage);
00083               storage.termID = tEntry->termID();
00084               storage.tf = tEntry->count();
00085               storage.val = tEntry->count();
00086               queryPassage->addTerm(storage);
00087             }
00088           } 
00089         }
00090         cout << "Autoquery: ";
00091         showPassage((*queryPassage).getAsVector(), idx);
00092         cout << endl;
00093 
00094         return 1;
00095       }
00096 
00097       int setMMRQuery(const string &qInfo) {
00098         if (qInfo != "") {
00099         // allocating a new object each time leaks
00100         // the vector makes a copy of the object.
00101         // dmf 05/2005
00102           //          termCount* storage;
00103           //          storage = new termCount;
00104           //          storage->termID = idx->term(qInfo);
00105           //          storage->tf = 1;
00106           //          storage->val = 1;
00107           //          queryPassage->addTerm(*storage);
00108           termCount storage;
00109           storage.termID = idx->term(qInfo);
00110           storage.tf = 1;
00111           storage.val = 1;
00112           queryPassage->addTerm(storage);
00113           return 1;
00114         }
00115         return autoMMRQuery();
00116       }
00117 
00118     public:
00119 
00120       MMRSumm(const lemur::api::Index* inIdx, int inSummLen = 5) :
00121         idx(inIdx), summLen(inSummLen), iterCount(1), maxSims(-1.0),
00122         queryPassage(NULL), lambda(1.0) {};
00123   
00124       virtual void markPassages(int optLen, const string &qInfo);
00125 
00126       virtual void addPassage(Passage &psg);
00127 
00128       void addDocument(const string &docID);
00129 
00130       virtual int fetchPassages(Passage* psgs, int optLen) const;
00131   
00132       virtual void summDocument(const string &docID, const int optLen, const string &qInfo);
00133 
00134       virtual void scorePassages(const string &qInfo);
00135 
00136       virtual void clear(void);
00137 
00138       virtual int nextPassage(Passage* psg) const;
00139 
00140       virtual void iterClear(void) const;
00141 
00142       virtual void outputSumm(void) const;
00143 
00144       void findNextPassage(MMRPassage &psg, 
00145                            const lemur::api::Index* idx, 
00146                            const lemur::api::TermInfoList* tList, int eos);
00147 
00148       void showPassage(const passageVec* psg, 
00149                        const lemur::api::Index* idx) const;
00150   
00151       void showMarkedPassages() const ;
00152 
00153       int isEOS(const string &check) {
00154         return (check == EOS);
00155       }
00156   
00157       int hasEOS(const lemur::api::Index* idx, 
00158                  const lemur::api::TermInfoList* tList) {
00159         tList->startIteration();
00160         lemur::api::TermInfo* tEntry;
00161         while (tList->hasMore()) {
00162           tEntry = tList->nextEntry();
00163           if ( isEOS(idx->term(tEntry->termID())) ) return true;
00164         }
00165         return false;
00166       }
00167   
00168       int isTITLE(const string & check) {
00169         //    return !strcmp(check, TITLE);
00170         return (check == TITLE);
00171       }
00172   
00173       int hasTITLE(const lemur::api::Index* idx, 
00174                    const lemur::api::TermInfoList* tList) {
00175         tList->startIteration();
00176         lemur::api::TermInfo* tEntry;
00177         while (tList->hasMore()) {
00178           tEntry = tList->nextEntry();
00179           if ( isTITLE(idx->term(tEntry->termID())) ) return true;
00180         }
00181         return false;
00182       }
00183   
00184       int isPRONOUN(const string &check) {
00185         return (check == PRONOUN);
00186       }
00187   
00188       struct compareSW {
00189         double lambda;
00190         compareSW(double l) { lambda = l; }
00191         bool operator()(const MMRPassage p1, const MMRPassage p2) const {
00192           return p1.computeMMR(lambda) > p2.computeMMR(lambda);
00193         }
00194       };
00196       static const string TITLE;
00198       static const string PRONOUN;
00199     }; // MMRSumm
00200   }
00201 }
00202 
00203 #endif

Generated on Tue Jun 15 11:02:54 2010 for Lemur by doxygen 1.3.4