Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

SmoothingAnnotatorWalker.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // SmoothingAnnotatorWalker
00015 //
00016 // 27 April 2004 -- tds
00017 //
00018 
00019 #ifndef INDRI_SMOOTHINGANNOTATORWALKER_HPP
00020 #define INDRI_SMOOTHINGANNOTATORWALKER_HPP
00021 
00022 #include "indri/Parameters.hpp"
00023 namespace indri
00024 {
00025   namespace lang
00026   {
00027     
00028     class SmoothingAnnotatorWalker : public indri::lang::Walker {
00029     private:
00030       struct rule_type {
00031         std::string node;
00032         std::string field;
00033         std::string op;
00034         std::string smoothing;
00035       };
00036 
00037       std::vector<rule_type*> _rules;
00038       std::string _defaultSmoothing;
00039 
00040       void _loadSmoothingRules( indri::api::Parameters& parameters ) {
00041         if( !parameters.exists("rule") )
00042           return;
00043 
00044         indri::api::Parameters rules = parameters["rule"];
00045 
00046         for(size_t i=0; i<rules.size(); i++) {
00047           std::string ruleText = rules[i];
00048 
00049           int nextComma = 0;
00050           int nextColon = 0;
00051           int location = 0;
00052 
00053           rule_type* rule = new rule_type;
00054           rule->node = "RawScorerNode";
00055           rule->op = "*";
00056           rule->field = "*";
00057 
00058           for( location = 0; location < ruleText.length(); ) {
00059             nextComma = ruleText.find( ',', location );
00060             nextColon = ruleText.find( ':', location );
00061 
00062             std::string key = ruleText.substr( location, nextColon-location );
00063             std::string value = ruleText.substr( nextColon+1, nextComma-nextColon-1 );
00064 
00065             if( key == "node" ) {
00066               rule->node = value;
00067             } else if( key == "field" ) {
00068               rule->field = value;
00069             } else if( key == "operator" ) {
00070               rule->op = value;
00071             }  else {
00072               if( rule->smoothing.size() ) rule->smoothing += ",";
00073               rule->smoothing += key + ":" + value;
00074             }
00075 
00076             if( nextComma > 0 )
00077               location = nextComma+1;
00078             else
00079               location = ruleText.size();
00080           }
00081 
00082           _rules.push_back(rule);
00083         }
00084       }
00085 
00086       const std::string& _matchSmoothingRule( const std::string& node, const std::string& field, const std::string& op ) {
00087         for( int i=signed(_rules.size())-1; i >= 0; i-- ) {
00088           const rule_type& rule = *_rules[i];
00089 
00090           if( ( rule.node == node ) &&
00091               ( rule.field == field || rule.field == "*" ) &&
00092               ( rule.op == op || rule.op == "*" ) ) {
00093             return rule.smoothing;
00094           }
00095         }
00096 
00097         return _defaultSmoothing;
00098       }
00099 
00100     public:
00101       SmoothingAnnotatorWalker( indri::api::Parameters& parameters ) {
00102         _loadSmoothingRules( parameters );
00103         _defaultSmoothing = "method:dirichlet,mu:2500";
00104       }
00105 
00106       ~SmoothingAnnotatorWalker( ) {
00107         indri::utility::delete_vector_contents<rule_type*>( _rules );
00108       }
00109 
00110       void after( indri::lang::RawScorerNode* scorer ) {
00111         indri::lang::Node* context = scorer->getContext();
00112         indri::lang::Field* contextField = dynamic_cast<indri::lang::Field*>(context);
00113         indri::lang::ExtentOr* contextExtOr = dynamic_cast<indri::lang::ExtentOr*>(context);
00114         std::string fieldName;
00115 
00116         // there may be an ExtentOr around the field, so descend into it if necessary
00117         if( contextExtOr && contextExtOr->getChildren().size() == 1 ) {
00118           contextField = dynamic_cast<indri::lang::Field*>(contextExtOr->getChildren()[0]);
00119         }
00120 
00121         // if there's a field here, record its name
00122         if( contextField ) {
00123           fieldName = contextField->getFieldName();
00124         } else {
00125           fieldName = "?";
00126         }
00127     
00128         indri::lang::Node* raw = scorer->getRawExtent();
00129         indri::lang::Node* rawTerm = dynamic_cast<indri::lang::IndexTerm*>(raw);
00130         indri::lang::Node* rawODNode = dynamic_cast<indri::lang::ODNode*>(raw);
00131         indri::lang::Node* rawUWNode = dynamic_cast<indri::lang::UWNode*>(raw);
00132         indri::lang::Node* rawWeightedExtentOr = dynamic_cast<indri::lang::WeightedExtentOr*>(raw);
00133 
00134         std::string op;
00135 
00136         if( rawODNode || rawUWNode ) {
00137           op = "window";
00138         } else if( rawTerm || rawWeightedExtentOr ) {
00139           op = "term";
00140         } else {
00141           op = "?";
00142         }
00143 
00144         scorer->setSmoothing( _matchSmoothingRule( "RawScorerNode", fieldName, op ) );
00145       }
00146 
00147       void after( indri::lang::NestedRawScorerNode* scorer ) {
00148         after( (indri::lang::RawScorerNode *) scorer );
00149       }
00150 
00151       void after( indri::lang::ShrinkageScorerNode* scorer ) {
00152         after( (indri::lang::RawScorerNode *) scorer );
00153 
00154         for( int i=signed(_rules.size())-1; i >= 0; i-- ) {
00155           const rule_type& rule = *_rules[i];
00156 
00157           if( rule.node == "ShrinkageBelief" &&
00158               rule.op == "*" ) {
00159             if ( rule.field == "*" ) {
00160               scorer->addShrinkageRule( rule.smoothing );
00161             } else {
00162               std::string ruleString = "field:" + rule.field + "," + rule.smoothing;
00163               scorer->addShrinkageRule( ruleString );
00164             }
00165           }
00166         }
00167       }
00168 
00169       void after( indri::lang::LengthPrior* prior ) {
00170         std::string ruleText = _matchSmoothingRule( "LengthPrior", "*", "*" );
00171         double exponent = 0;
00172 
00173         int nextComma = 0;
00174         int nextColon = 0;
00175         int location = 0;
00176         
00177         for( location = 0; location < ruleText.length(); ) {
00178           nextComma = ruleText.find( ',', location );
00179           nextColon = ruleText.find( ':', location );
00180 
00181           std::string key = ruleText.substr( location, nextColon-location );
00182           std::string value = ruleText.substr( nextColon+1, nextComma-nextColon-1 );
00183 
00184           if( key == "exponent" ) {
00185             exponent = atof( value.c_str() );       
00186           } 
00187           if( nextComma > 0 )
00188             location = nextComma+1;
00189           else
00190             location = ruleText.size();
00191         }
00192         
00193         prior->setExponent( exponent );
00194       }
00195     };
00196   }
00197 }
00198 
00199 #endif // INDRI_SMOOTHINGANNOTATORWALKER_HPP
00200 

Generated on Tue Jun 15 11:02:55 2010 for Lemur by doxygen 1.3.4