Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

ConflationPattern.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2005 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // ConflationPattern
00015 //
00016 // 15 September 2005 -- mwb
00017 //
00018 
00019 // Data structure to support tag-attribute-value conflations at
00020 // parsing time.  To illustrate by example, consider these three
00021 // tags encountered in the source document text:
00022 //
00023 // <TAG ... ATT1="VAL1" ... />
00024 // <TAG ... ATT1="VAL2" ... />
00025 // <TAG ... ATT2="VAL1" ... />
00026 //
00027 // The pattern { "tag", null, null } matches all three.
00028 // The pattern { "tag", "att1", null } matches only the top two.
00029 // The pattern { "tag", "att1", "VAL2" } matches only the middle one.
00030 //
00031 // Note that the pattern { "tag", null, "VAL2" } is not valid.
00032 //
00033 // These patterns are defined in FileClassEnvironmentFactory, and are
00034 // passed to the ParserFactory when the FileClassEnvironment is
00035 // constructed.  The Parser will replace any tags that match the
00036 // pattern with a tag of a specified name that has no attributes.
00037 
00038 #ifndef INDRI_CONFLATIONPATTERN_HPP
00039 #define INDRI_CONFLATIONPATTERN_HPP
00040 
00041 #include <string.h>
00042 #include <functional>
00043 
namespace indri {
  namespace parse {

    // A tag / attribute / value pattern used to conflate tags at
    // parsing time.  A NULL member is interpreted as a wildcard.
    //
    // The tag_name and attribute_name strings in the
    // ConflationPattern should always be downcased, but value should
    // appear as it does in the source document.

    struct ConflationPattern {
      const char* tag_name;        // downcased tag name, or NULL (wildcard)
      const char* attribute_name;  // downcased attribute name, or NULL (wildcard)
      const char* value;           // attribute value as written, or NULL (wildcard)
    };

  }
}

namespace std {

  // Ordering for ConflationPattern pointers, comparing the pointed-to
  // patterns rather than the pointer values.
  //
  // Fields are compared in the order tag_name, attribute_name, value;
  // the first field that differs decides the result.  Two non-NULL
  // strings are ordered by strcmp.  Because tag_name and
  // attribute_name should always be downcased in a ConflationPattern,
  // this leads to a case-insensitive match on those two fields.
  //
  // A NULL entry ( a wildcard ) sorts before any non-NULL string, and
  // two NULL entries compare equal, so:
  //
  //   { NULL, NULL, NULL } comes before every other pattern.
  //   { x, NULL, NULL } comes before { x, y, NULL }
  //   { x, y, NULL } comes before { x, y, z }
  //
  // Equal patterns never precede one another; this keeps the
  // comparison a strict weak ordering, as ordered containers and
  // sorting algorithms require.

  template <>
  struct less<indri::parse::ConflationPattern *> {

    // Return true if ConflationPattern one strictly precedes
    // ConflationPattern two; false otherwise ( including when the two
    // patterns are equal ).  Both pointers must be non-NULL: they are
    // dereferenced unconditionally.

    bool operator() ( const indri::parse::ConflationPattern* one,
                      const indri::parse::ConflationPattern* two ) const {

      int r = compare_field( one->tag_name, two->tag_name );

      // Only consult a later field when every earlier field tied.
      if ( r == 0 )
        r = compare_field( one->attribute_name, two->attribute_name );

      if ( r == 0 )
        r = compare_field( one->value, two->value );

      return ( r < 0 );

    }

  private:

    // Three-way comparison of one pattern field.  Returns a negative
    // value if a sorts before b, a positive value if a sorts after b,
    // and zero if they are equal.  NULL sorts before any string and
    // is equal only to NULL.

    static int compare_field( const char* a, const char* b ) {

      if ( a && b ) return strcmp( a, b );
      if ( a ) return 1;     // b is NULL, so b comes first
      if ( b ) return -1;    // a is NULL, so a comes first

      return 0;              // both NULL: equal, let the next field decide

    }
  };
}
00119 
00120 #endif // INDRI_CONFLATIONPATTERN_HPP

Generated on Tue Jun 15 11:02:53 2010 for Lemur by doxygen 1.3.4