Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

Porter_Stemmer.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2005 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 /* This is the Porter stemming algorithm, coded up in ANSI C by the
00014    author. It may be be regarded as cononical, in that it follows the
00015    algorithm presented in
00016 
00017    Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
00018    no. 3, pp 130-137,
00019 
00020    only differing from it at the points maked --DEPARTURE-- below.
00021 
00022    See also http://www.omsee.com/~martin/stem.html
00023 
00024    The algorithm as described in the paper could be exactly replicated
00025    by adjusting the points of DEPARTURE, but this is barely necessary,
00026    because (a) the points of DEPARTURE are definitely improvements, and
00027    (b) no encoding of the Porter stemmer I have seen is anything like
00028    as exact as this version, even with the points of DEPARTURE!
00029 
00030    You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which
00031    'stem' takes a list of inputs and sends the stemmed equivalent to
00032    stdout.
00033 
00034    The algorithm as encoded here is particularly fast.
00035 
00036    Release 1
00037 
00038    07/31/2005 -- Rewrite to be a standalone object.
00039 */
00040 
00041 
00042 /* The main part of the stemming algorithm starts here. b is a buffer
00043    holding a word to be stemmed. The letters are in b[k0], b[k0+1] ...
00044    ending at b[k]. In fact k0 = 0 in this demo program. k is readjusted
00045    downwards as the stemming progresses. Zero termination is not in fact
00046    used in the algorithm.
00047 
00048    Note that only lower case sequences are stemmed. Forcing to lower case
00049    should be done before stem(...) is called.
00050 */
00051 #ifndef _PORTER_STEMMER_H_
00052 #define _PORTER_STEMMER_H_
00053 #include "indri/Mutex.hpp"
00054 #include "indri/ScopedLock.hpp"
00055 
00056 namespace indri
00057 {
00058   namespace parse 
00059   {
00060     class Porter_Stemmer 
00061     {
00062     private:
00063       indri::thread::Mutex _stemLock;
00064       char * b;       /* buffer for word to be stemmed */
00065       int k,k0,j;     /* j is a general offset into the string */
00066 
00067       /* cons(i) is TRUE <=> b[i] is a consonant. */
00068       bool cons(int i);
00069 
00070       /* m() measures the number of consonant sequences between k0 and j. if c is
00071          a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
00072          presence,
00073 
00074          <c><v>       gives 0
00075          <c>vc<v>     gives 1
00076          <c>vcvc<v>   gives 2
00077          <c>vcvcvc<v> gives 3
00078          ....
00079       */
00080 
00081       int m();
00082 
00083       /* vowelinstem() is TRUE <=> k0,...j contains a vowel */
00084 
00085       bool vowelinstem();
00086 
00087       /* doublec(j) is TRUE <=> j,(j-1) contain a double consonant. */
00088 
00089       bool doublec(int j);
00090 
00091       /* cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
00092          and also if the second c is not w,x or y. this is used when trying to
00093          restore an e at the end of a short word. e.g.
00094 
00095          cav(e), lov(e), hop(e), crim(e), but
00096          snow, box, tray.
00097 
00098       */
00099 
00100       bool cvc(int i);
00101 
00102       /* ends(s) is TRUE <=> k0,...k ends with the string s. */
00103 
00104       bool ends(const char * s);
00105 
00106       /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
00107          k. */
00108 
00109       void setto(const char * s);
00110 
00111       /* r(s) is used further down. */
00112 
00113       void r(const char * s);
00114 
00115       /* step1ab() gets rid of plurals and -ed or -ing. e.g.
00116 
00117       caresses  ->  caress
00118       ponies    ->  poni
00119       ties      ->  ti
00120       caress    ->  caress
00121       cats      ->  cat
00122 
00123       feed      ->  feed
00124       agreed    ->  agree
00125       disabled  ->  disable
00126 
00127       matting   ->  mat
00128       mating    ->  mate
00129       meeting   ->  meet
00130       milling   ->  mill
00131       messing   ->  mess
00132 
00133       meetings  ->  meet
00134 
00135       */
00136 
00137       void step1ab();
00138 
00139       /* step1c() turns terminal y to i when there is another vowel in the stem. */
00140 
00141       void step1c();
00142 
00143 
00144       /* step2() maps double suffices to single ones. so -ization ( = -ize plus
00145          -ation) maps to -ize etc. note that the string before the suffix must give
00146          m() > 0. */
00147 
00148       void step2();
00149 
00150       /* step3() deals with -ic-, -full, -ness etc. similar strategy to step2. */
00151 
00152       void step3();
00153 
00154       /* step4() takes off -ant, -ence etc., in context <c>vcvc<v>. */
00155 
00156       void step4();
00157 
00158       /* step5() removes a final -e if m() > 1, and changes -ll to -l if
00159          m() > 1. */
00160 
00161       void step5();
00162     public:
00163       /* In stem(p,i,j), p is a char pointer, and the string to be stemmed is from
00164          p[i] to p[j] inclusive. Typically i is zero and j is the offset to the last
00165          character of a string, (p[j+1] == '\0'). The stemmer adjusts the
00166          characters p[i] ... p[j] and returns the new end-point of the string, k.
00167          Stemming never increases word length, so i <= k <= j. To turn the stemmer
00168          into a module, declare 'stem' as extern, and delete the remainder of this
00169          file.
00170       */
00181       int porter_stem(char * p, int i, int j);
00182     };
00183   }
00184 }
00185 #endif /* _PORTER_STEMMER_H_*/

Generated on Tue Jun 15 11:02:55 2010 for Lemur by doxygen 1.3.4