Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

BasicSumm.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2002 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 #ifndef _BASICSUMM_HPP
00012 #define _BASICSUMM_HPP
00013 
00014 #include <iomanip>
00015 #include "lemur-compat.hpp"
00016 #include "Summarizer.hpp"
00017 #include "Passage.hpp"
00018 #include "BasicPassage.hpp"
00019 #include "Index.hpp"
00020 #include <algorithm>
00021 #include <vector>
00022 #include <cmath>
00023 using std::vector;
00024 
00025 namespace lemur 
00026 {
00027   namespace summarization 
00028   {
00033     class BasicSumm : public Summarizer {
00034 
00035     private:
00036       const lemur::api::Index* idx;
00037       int summLen;
00038       vector<BasicPassage> doc;
00039       mutable int iterCount;
00040 
00041     public:
00043       BasicSumm(const lemur::api::Index* inIdx, int inSummLen = 5) :
00044         idx(inIdx), summLen(inSummLen), iterCount(1) {};      
00045 
00046       virtual void summDocument(const string &docID, const int optLen, 
00047                                 const string &qInfo);
00048 
00049       virtual void scorePassages(const string &qInfo);
00050 
00051       virtual void markPassages(int optLen, const string &qInfo);
00052 
00053       virtual void addPassage(Passage &psg);
00054 
00055       virtual void clear(void);
00056 
00057       virtual int fetchPassages(Passage* psgs, int optLen) const;
00058 
00059       virtual int nextPassage(Passage* psg) const;
00060 
00061       virtual void iterClear(void) const ;
00062 
00063       virtual void outputSumm(void) const ;
00064 
00066       int isEOS(const string &check) {
00067         return (check  == EOS);
00068       }
00069 
00071       int hasEOS(const lemur::api::Index* idx, 
00072                  const lemur::api::TermInfoList* tList) {
00073         tList->startIteration();
00074         lemur::api::TermInfo* tEntry;
00075         while (tList->hasMore()) {
00076           tEntry = tList->nextEntry();
00077           if ( isEOS(idx->term(tEntry->termID())) ) return true;
00078         }
00079         return false;
00080       }
00081 
00083       double scorePassage(BasicPassage &psg, const string &qInfo) {
00084         const string &docID = psg.docID;
00085         passageVec psgV= *psg.getAsVector();
00086         double psgLen = psgV.size();
00087         double P = 1;  // no markup yet, all get same weight
00088         double M = 1.5;
00089         double endScore, Tf, tf, idf, docLen, avgDocLen;
00090         endScore = 0.0;
00091         for (int i=0; i < psgLen; i++) {
00092           docLen = idx->docLength(idx->document(docID));
00093           avgDocLen = idx->docLengthAvg();
00094           tf = psgV[i].tf;
00095           Tf = tf / (tf + 0.5 + 1.5 * (docLen/avgDocLen) );
00096           idf = lemur_compat::min(M, 
00097                                   log((double)idx->docCount()/
00098                                       (double)idx->docCount(psgV[i].termID))); 
00099           endScore += (Tf * idf * P);
00100         }
00101         endScore = endScore / 1+psgLen;
00102         psg.score = endScore;
00103         return endScore;
00104       }
00105 
00107       void findNextPassage(BasicPassage &psg, 
00108                            const lemur::api::Index* idx, 
00109                            const lemur::api::TermInfoList* tList, int eos) {
00110         lemur::api::TermInfo* tEntry;
00111         psg.clear();
00112         // allocating a new object each time leaks
00113         // the vector makes a copy of the object.
00114         // dmf 05/2005
00115         //termCount* storage;
00116         termCount storage;
00117         if (eos) {
00118           while (tList->hasMore()) {
00119             tEntry = tList->nextEntry();
00120             if ( isEOS(idx->term(tEntry->termID())) ) return;
00121             //            storage = new termCount;
00122             //storage->termID = tEntry->termID();
00123             //storage->tf = tEntry->count();
00124             //psg.addTerm(*storage);
00125             storage.termID = tEntry->termID();
00126             storage.tf = tEntry->count();
00127             psg.addTerm(storage);
00128           }
00129         } else {
00130           for(int i=0; i < PSG_LEN; i++) {
00131             if (tList->hasMore()) {
00132               tEntry = tList->nextEntry();
00133               //storage = new termCount;
00134               //storage->termID = tEntry->termID();
00135               //storage->tf = tEntry->count();
00136               //psg.addTerm(*storage);
00137               storage.termID = tEntry->termID();
00138               storage.tf = tEntry->count();
00139               psg.addTerm(storage);
00140             } else {
00141               return;
00142             }
00143           }
00144         }
00145         return;
00146       }
00147  
00149       void showPassage(const passageVec* psg, 
00150                        const lemur::api::Index* idx) const {
00151         for (int i=0; i < psg->size(); i++) {
00152           cout << idx->term((*psg)[i].termID) << " ";
00153         }
00154       }
00155 
00157       void showMarkedPassages() const {
00158     
00159         for (int i=0; i<doc.size(); i++) {
00160           if (doc[i].marked > 0) {
00161             showPassage(doc[i].getAsVector(), idx);
00162             cout << endl;
00163           }
00164         }
00165       }
00166 
00167     }; // BasicSumm
00168   }
00169 }
00170 
00171 #endif

Generated on Tue Jun 15 11:02:53 2010 for Lemur by doxygen 1.3.4