Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

ContextSimpleCountCollectorCopier.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // ContextSimpleCountCollectorCopier
00015 //
00016 // 5 March 2004 -- tds
00017 //
00018 // This copier uses an IndriIndex to extract context
00019 // counts for certain simple subgraphs.  It can compute
00020 // counts for the following types of expressions:
00021 //
00022 //   dog
00023 //   <dog cat>
00024 //   dog.title
00025 //   <dog cat>.title
00026 //   dog.(title)
00027 //   <dog cat>.(title)
00028 //   
00029 // Notably, it is unable to compute counts when more than
00030 // one field is involved.
00031 //
00032 
00033 #ifndef INDRI_CONTEXTSIMPLECOUNTCOLLECTORCOPIER_HPP
00034 #define INDRI_CONTEXTSIMPLECOUNTCOLLECTORCOPIER_HPP
00035 
00036 #include "indri/QuerySpec.hpp"
00037 #include "indri/Copier.hpp"
00038 #include "indri/delete_range.hpp"
00039 #include "indri/Repository.hpp"
00040 
00041 namespace indri
00042 {
00043   namespace lang
00044   {
00045     class ContextSimpleCountCollectorCopier : public indri::lang::Copier {
00046       // Swaps each ContextCounterNode whose subtree SubtreeWalker accepts for a ContextSimpleCounterNode.
00047     private:
00048       std::vector<indri::lang::Node*> _newNodes;   // nodes handed back by this copier; freed in the destructor
00049       indri::collection::Repository& _repository;  // processes the text of terms not marked as stemmed
00050 
00051       // Walks one ContextCounterNode subtree, collecting terms and field and judging precomputability.
00052       class SubtreeWalker : public indri::lang::Walker {
00053       private:
00054         bool _computable;  // false once an unsupported node shape has been seen
00055         bool _hasContext;  // true when the visited ContextCounterNode has a context
00056         std::vector<indri::lang::IndexTerm*> _terms;
00057         indri::lang::Field* _field;  // most recent Field visited, 0 if none
00058 
00059       public:
00060         SubtreeWalker() :
00061           _computable(true),
00062           _hasContext(false),  // previously uninitialized; hasContext() must not read an indeterminate value
00063           _field(0)
00064         {}
00065 
00066         // True when nothing unsupported was seen, a term was found, and any context came with a field.
00067         bool isComputable() const {
00068           return _computable && !_terms.empty() && ( !_hasContext || _field != 0 );
00069         }
00070 
00071         std::vector<indri::lang::IndexTerm*>& getTerms() {
00072           return _terms;
00073         }
00074 
00075         indri::lang::Field* getField() {
00076           return _field;
00077         }
00078 
00079         bool hasContext() const {
00080           return _hasContext;
00081         }
00082 
00083         void defaultBefore( indri::lang::Node* node ) {
00084           // this means that we're seeing some node type that
00085           // we aren't otherwise trapping--that means this subtree
00086           // is surely not precomputable
00087           _computable = false;
00088         }
00089 
00090         void before( indri::lang::ContextCounterNode* contextNode ) {
00091           // if the context node has a context, then it must have a field in the context
00092           // if we find more than one field, we say this isn't computable.  Therefore, if
00093           // this subtree is computable and it has a context, the single field must be in the context.
00094           _hasContext = contextNode->getContext() ? true : false;
00095         }
00096 
00097         void before( indri::lang::ExtentAnd* extentAndNode ) {
00098           // we definitely can't deal with any "true" extentAnds
00099           // however, if this is just an and wrapper around a single
00100           // field, we won't let it fool us
00101           if( extentAndNode->getChildren().size() > 1 )
00102             _computable = false;
00103         }
00104 
00105         void before( indri::lang::Field* fieldNode ) {
00106           if( _field ) {
00107             // fields can't be or-ed together; only terms can (_extentOr)
00108             // If we already saw a field, then this one proves that the tree isn't computable (_field)
00109             _computable = false;
00110           }
00111 
00112           _field = fieldNode;
00113         }
00114 
00115         void before( indri::lang::IndexTerm* termNode ) {
00116           _terms.push_back(termNode);
00117         }
00118 
00119         void before( indri::lang::ExtentInside* insideNode ) {
00120           // ignore this; the other checks should catch any bad trees
00121           // without having to worry about checking here
00122         }
00123       };
00124 
00125     public:
00126       ContextSimpleCountCollectorCopier( indri::collection::Repository& repository ) :
00127         _repository(repository)
00128       {}
00129 
00130       ~ContextSimpleCountCollectorCopier() {
00131         indri::utility::delete_vector_contents<indri::lang::Node*>( _newNodes );
00132       }
00133 
00134       // Records the copied node in _newNodes (released by the destructor) and returns it unchanged.
00135       indri::lang::Node* defaultAfter( indri::lang::Node* oldNode, indri::lang::Node* newNode ) {
00136         _newNodes.push_back( newNode );
00137         return newNode;
00138       }
00139 
00140       // Returns a new ContextSimpleCounterNode (deleting newNode) when the subtree is computable, else newNode itself.
00141       indri::lang::Node* after( indri::lang::ContextCounterNode* contextNode, indri::lang::ContextCounterNode* newNode ) {
00142         // first, walk the subtree to find out if it's computable
00143         SubtreeWalker subtree;
00144         contextNode->walk(subtree);
00145         indri::lang::Node* result = newNode;
00146 
00147         if( subtree.isComputable() ) {
00148           // terms: text not marked as stemmed goes through the repository's term processing first
00149           const std::vector<indri::lang::IndexTerm*>& found = subtree.getTerms();
00150           std::vector<std::string> terms;
00151           terms.reserve( found.size() );
00152           for( size_t i=0; i<found.size(); i++ ) {
00153             indri::lang::IndexTerm* indexTerm = found[i];
00154             if( indexTerm->getStemmed() == false )
00155               terms.push_back( _repository.processTerm( indexTerm->getText() ) );
00156             else
00157               terms.push_back( indexTerm->getText() );
00158           }
00159 
00160           std::string field;
00161           std::string context;
00162           // isComputable() guarantees getField() is non-null whenever hasContext() is true
00163           if( subtree.hasContext() ) {
00164             context = subtree.getField()->getFieldName();
00165           } else if( subtree.getField() ) {
00166             field = subtree.getField()->getFieldName();
00167           }
00168 
00169           result = new indri::lang::ContextSimpleCounterNode( terms, field, context );
00170           result->setNodeName( contextNode->nodeName() );
00171           delete newNode;
00172         }
00173 
00174         // if it wasn't computable, keep the subtree around so the
00175         // inference network code can run it and figure out the counts
00176         _newNodes.push_back( result );
00177         return result;
00178       }
00179     };
00180   }
00181 }
00182 
00183 #endif // INDRI_CONTEXTSIMPLECOUNTCOLLECTORCOPIER_HPP
00184 

Generated on Tue Jun 15 11:02:53 2010 for Lemur by doxygen 1.3.4