Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

KrovetzStemmer.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2005 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 // dmf
00012 // C++ thread safe implementation of the Krovetz stemmer.
00013 // requires no external data files.
00014 // 07/29/2005
00015 #ifndef _KROVETZ_STEMMER_H_
00016 #define _KROVETZ_STEMMER_H_
00017 #include <cstring>
00018 #ifdef WIN32
00019 #include <hash_map>
00020 #else
00021 // TODO: move this macro into a shared configuration header.
      // HAVE_GCC_VERSION(MAJOR, MINOR) evaluates to true when __GNUC__ and
      // __GNUC_MINOR__ describe a version that is at least MAJOR.MINOR.
      // It is only defined here if no earlier header has already defined it.
00022 #ifndef HAVE_GCC_VERSION
00023 #define HAVE_GCC_VERSION(MAJOR, MINOR) \
00024   (__GNUC__ > (MAJOR) || (__GNUC__ == (MAJOR) && __GNUC_MINOR__ >= (MINOR)))
00025 #endif /* ! HAVE_GCC_VERSION */
00026 #if HAVE_GCC_VERSION(4,3)
00027 // if GCC 4.3+
00028 #include <tr1/unordered_map>
00029 #else
00030 #include <ext/hash_map>
00031 #endif
00032 // 3.3 does not use __gnu_cxx, 3.4+ does.
00033 using namespace __gnu_cxx;
00034 #endif
00035 #include "indri/Mutex.hpp"
00036 #include "indri/ScopedLock.hpp"
00037 
00038 namespace indri
00039 {
00040   namespace parse 
00041   {
00042     class KrovetzStemmer 
00043     {
            // Krovetz stemmer. The file header describes this class as a
            // thread safe implementation that requires no external data files.
            // The declaration only shows the interface and the per-object state;
            // all member functions are defined outside this header.
00044     public:
00045       KrovetzStemmer();
00046       ~KrovetzStemmer();
            // Size, in chars, of every fixed word/stem buffer declared below
            // (the cache slots and the internal stem buffer).
            // NOTE(review): whether this count includes the terminating NUL is
            // not visible from the header -- confirm in the implementation.
00048       static const int MAX_WORD_LENGTH=25;
            // Stems term and returns a char pointer.
            // NOTE(review): the comment on the stem member below says it is
            // "used by kstem_stemmer to return a safe value", so the returned
            // pointer presumably refers to that internal buffer and is only
            // valid until the next call -- confirm against the implementation.
00060       char * kstem_stemmer(char *term);
            // Variant of the above that takes a caller supplied output buffer.
            // NOTE(review): the meaning of the int return value and the required
            // size of buffer are not visible here -- presumably the buffer must
            // hold at least MAX_WORD_LENGTH chars; verify before relying on it.
00072       int kstem_stem_tobuffer(char *term, char *buffer);
            // Adds an entry to the stemmer's table for variant and word; exc
            // defaults to false.
            // NOTE(review): presumably variant is the lookup key, word the root
            // it maps to, and exc marks the entry as an exception (see dictEntry)
            // -- confirm. The table is keyed by const char *, so the lifetime of
            // the strings passed in likely matters; confirm who owns them.
00080       void kstem_add_table_entry(const char* variant, const char* word, 
00081                                  bool exc=false);
00082     private:
            // Mutex member of the stemmer.
            // NOTE(review): presumably taken (via ScopedLock, which is included
            // above) around stemming calls to provide the thread safety the file
            // header mentions -- confirm in the implementation.
00084       indri::thread::Mutex _stemLock;
            // Value type stored in dictTable: an exception flag plus a pointer
            // to the root form. The root string is not owned by value here.
00086       typedef struct dictEntry {
00088         bool exception;      
00090         const char *root;
00091       } dictEntry;
            // One slot of the stem cache: a flag byte and two word/stem pairs,
            // each held in a fixed MAX_WORD_LENGTH buffer.
            // NOTE(review): the meaning of flag is not visible here; possibly it
            // selects which of the two pairs is replaced next -- confirm.
00093       typedef struct cacheEntry {
00095         char flag; 
00097         char word1[MAX_WORD_LENGTH];
00099         char stem1[MAX_WORD_LENGTH];
00101         char word2[MAX_WORD_LENGTH];
00103         char stem2[MAX_WORD_LENGTH];
00104       } cacheEntry;
00105 
00106       // operates on attribute word.
            // The helpers below take indices or no arguments at all, i.e. they
            // work on the object's state (word, j and k declared further down)
            // rather than on a string passed in by the caller.
00107       bool ends(const char *s, int sufflen);
00108       void setsuff(const char *str, int length);
00109       dictEntry *getdep(char *word);
00110       bool lookup(char *word);
00111       bool cons(int i);
00112       bool vowelinstem();
00113       bool vowel(int i);
00114       bool doublec(int i);
            // One step per suffix family; each takes no arguments and returns
            // nothing, so any result is left in the member state.
00115       void plural();
00116       void past_tense();
00117       void aspect();
00118       void ion_endings();
00119       void er_and_or_endings ();
00120       void ly_endings ();
00121       void al_endings() ;
00122       void ive_endings() ;
00123       void ize_endings() ;
00124       void ment_endings() ;
00125       void ity_endings() ;
00126       void ble_endings() ;
00127       void ness_endings() ;
00128       void ism_endings();
00129       void ic_endings();
00130       void ncy_endings();
00131       void nce_endings();
00132       // maintenance.
            // NOTE(review): presumably fills dictEntries from tables compiled
            // into the library, since the file header says no external data
            // files are required -- confirm.
00133       void loadTables();
00134 #ifdef WIN32
            // Ordering predicate on C strings: true when s1 sorts before s2
            // according to strcmp.
00135       struct ltstr {
00136         bool operator()(const char* s1, const char* s2) const {
00137           return strcmp(s1, s2) < 0;
00138         }
00139       };
00140       //studio 7 hash_map provides hash_compare, rather than hash
00141       // needing an < predicate, rather than an == predicate.
00142       typedef stdext::hash_map<const char *, dictEntry, stdext::hash_compare<const char *, ltstr> > dictTable;
00143 #else
            // Equality predicate on C strings: true when strcmp reports that
            // s1 and s2 have identical contents.
00144       struct eqstr {
00145         bool operator()(const char* s1, const char* s2) const {
00146           return strcmp(s1, s2) == 0;
00147         }
00148       };
00149 #if HAVE_GCC_VERSION(4,3)
            // NOTE(review): std::tr1::hash<const char *> appears to be the
            // generic pointer hash (it hashes the address, not the characters),
            // while eqstr compares the characters. If so, a lookup with a
            // different pointer to an equal string would miss -- confirm, and
            // compare with the pre-4.3 branch below.
00150       typedef std::tr1::unordered_map<const char *, dictEntry, std::tr1::hash<const char *>, eqstr> dictTable;
00151 #else
            // hash_map and hash are found through the file-level
            // "using namespace __gnu_cxx" directive.
00152       typedef hash_map<const char *, dictEntry, hash<const char *>, eqstr> dictTable;
00153 #endif
00154 #endif
            // Dictionary keyed by C string; values are dictEntry records.
00155       dictTable dictEntries;
00156       // this needs to be a bounded size cache.
00157       // kstem.cpp uses size 30013 entries.
            // NOTE(review): raw pointer; presumably allocated in the constructor
            // and released in the destructor. The class declares no copy
            // constructor or assignment operator, so copying an instance would
            // share this pointer -- confirm instances are never copied.
00158       cacheEntry *stemCache;
00159       // size (NOTE(review): presumably the number of stemCache slots -- confirm)
00160       int stemhtsize;
00161       // state
00162       // k = wordlength - 1
00163       int k;
00164       // j is stemlength - 1
00165       int j;
00166       // pointer to the output buffer
00167       char *word;
00168       // used by kstem_stemmer to return a safe value.
00169       char stem[MAX_WORD_LENGTH];
00170     };
00171   }
00172 }
00173 #endif /* _KROVETZ_STEMMER_H_*/

Generated on Tue Jun 15 11:02:54 2010 for Lemur by doxygen 1.3.4