Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

PassageRep.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 #ifndef _PASSAGEREP_HPP
00013 #define _PASSAGEREP_HPP
00014 
      #include <algorithm>
00015 #include <vector>
00016 #include "MatchInfo.hpp"
00017 
00018 namespace lemur 
00019 {
00020   namespace retrieval 
00021   {
00022     
00024     struct PassageScore {
00026       int id;
00028       int start;
00030       int end;
00032       double score;
00033     };
00034 
00036     class PassageScoreVector : public vector<PassageScore> {
00037     public:
00038       PassageScoreVector() : vector<PassageScore>() {
00039       }
00041       void sortScores() {
00042         sort(this->begin(), this->end(), cmpFn);
00043       }
00044     private:
00045       class PassageScoreDescending { 
00046       public: 
00047         bool operator()(const PassageScore & a, const PassageScore & b) {
00048           return a.score > b.score;
00049         }
00050       };
00051       static PassageScoreDescending cmpFn;
00052     };
00054 
00060     class PassageRep : public lemur::api::DocumentRep {
00061     public:
00067       PassageRep(lemur::api::DocumentRep &dRep, int d, int p, int o) :
00068         lemur::api::DocumentRep(dRep.getID(), p),
00069         docRep(dRep), docEnd(d), psgSize(p), overlap(o) {
00070         // update encapsulated rep.
00071         docRep.setDocLength(docLength);
00072       }
00073 
00074 #if 0
00075       PassageRep(): DocumentRep(0, 0), docRep(*this) { 
00076       }
00077 #endif
00078 
00079 #if 0
00080 
00081       void startPassageIteration() {
00082         start = 0;
00083         end = psgSize < docEnd ? psgSize : docEnd;
00084       }
00086       bool hasMorePassage() {
00087         return(start < docEnd);
00088       }
00090       void nextPassage() {
00091         int next = start + (end - overlap);
00092         if(next < docEnd)
00093           start = next;
00094         else
00095           start = docEnd;
00096         end = (start + psgSize) < docEnd ? (start + psgSize) : docEnd;
00097         docLength = end - start; // adjust for shorter last passage.
00098         // update encapsulated rep.
00099         docRep.setDocLength(docLength);
00100       }
00101 #endif
00102 
00103       class iterator {
00104       public:
00105         iterator() : start(0), end(0), psgSize(0), overlap(0), docEnd(0),
00106                      rep(NULL) {};
00107         iterator(int s, int e, int p, int o, int d, PassageRep *r) : start(s), 
00108                                                                      end(e), 
00109                                                                      psgSize(p), 
00110                                                                      overlap(o), 
00111                                                                      docEnd(d) {
00112           rep = new PassageRep(*r);
00113         };
00114         // risk of double deletes when copying?
00115         virtual ~iterator() { 
00116           delete(rep);
00117         };
00119         virtual PassageRep &operator*(){ return *rep;};
00120 
00121         virtual iterator& operator++(){
00122           int next = start + (end - overlap);
00123           if(next < docEnd)
00124             start = next;
00125           else
00126             start = docEnd;
00127           end = (start + psgSize) < docEnd ? (start + psgSize) : docEnd;
00128           int docLength = end - start; // adjust for shorter last passage.
00129           // update encapsulated rep. // fix this!
00130           rep->setEnd(start, end, docLength);
00131           return *this;
00132         };
00133 
00134         virtual iterator& operator++(int){  // ++foo semantics here?
00135           int next = start + (end - overlap);
00136           if(next < docEnd)
00137             start = next;
00138           else
00139             start = docEnd;
00140           end = (start + psgSize) < docEnd ? (start + psgSize) : docEnd;
00141           int docLength = end - start; // adjust for shorter last passage.
00142           // update encapsulated rep. // fix this!
00143           rep->setEnd(start, end, docLength);
00144           return *this;
00145         }; 
00147         virtual bool operator==(iterator& other)
00148         {
00149           return (other.start == start && other.end == end);
00150         };
00152         virtual bool operator!=(iterator& other)
00153         {
00154           return !(other.start == start && other.end == end);
00155         };
00156       protected:
00158         PassageRep *rep;
00160         int psgSize;
00162         int overlap;
00164         int docEnd;
00166         int start;
00168         int end;
00169       };
00170   
00171       // could trim start, end attributes.
00172       PassageRep::iterator begin() {
00173         start = 0;
00174         pEnd = psgSize < docEnd ? psgSize : docEnd;
00175         // need to keep the state in the iterator only.
00176         PassageRep::iterator retval(0, pEnd, psgSize, overlap, docEnd, this);
00177         return retval;
00178       }
00179 
00180       PassageRep::iterator end() {
00181         PassageRep::iterator retval (docEnd, docEnd, psgSize, overlap, docEnd, 
00182                                      this);
00183         return retval;
00184       }
00186       void setEnd(int s, int e, int dl) {
00187         start = s;
00188         pEnd = e;
00189         docRep.setDocLength(dl);
00190       }
00197       int passageTF(lemur::api::TERMID_T tid, lemur::api::MatchInfo *matches) const {
00198         int tf = 0;
00199         int pos = 0;
00200         lemur::api::MatchInfo::iterator m = matches->begin();
00201         while (m != matches->end() && pos < pEnd) {
00202           lemur::api::TMatch match = *m;
00203           pos = match.position;
00204           // adjust for stopwords not counted. Bleah
00205           if (pos > docEnd) docEnd = pos + 1;
00206           if (match.tid == tid) {
00207             if (pos >= start && pos < pEnd) {
00208               tf++;
00209             }
00210           }
00211           m++;
00212         }
00213         return tf;
00214       }
00216       int getStart () const {return start;}
00218       int getEnd () const {return pEnd;}
00219 
00221       virtual double termWeight(lemur::api::TERMID_T termID, const lemur::api::DocInfo *info) const {
00222         return docRep.termWeight(termID, info);
00223       }
00224   
00226       virtual double scoreConstant()  const {
00227         return docRep.scoreConstant();
00228       }
00229   
00230 
00231     protected:
00233       lemur::api::DocumentRep &docRep;
00235       int psgSize;
00237       int overlap;
00239       mutable int docEnd;
00241       mutable int start;
00243       mutable int pEnd;
00244     };
00245   }
00246 }
00247 
00248 #endif /* _PASSAGEREP_HPP */
00249 
00250 
00251 

Generated on Tue Jun 15 11:02:55 2010 for Lemur by doxygen 1.3.4