Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

Arabic_Stemmer.hpp

Go to the documentation of this file.
00001 
00002 /**************************************************************************/
00003 /**************************************************************************/
00004 /**************            ARABIC STEMMER HEADER FILE         *************/
00005 /**************************************************************************/
00006 /**************************************************************************/
00007 
00008 /*
00009 
00010 Copyright (c) 2001 UMASS CIIR All rights reserved.
00011 Written by Nick Dufresne (nickd@cs.brandeis.edu)
00012         
00013 08/01/2005 -- rewrite as standalone thread-safe class.
00014 */
00015 #include <string>
00016 #include <cstring>
00017 #include <set>
00018 
00019 namespace lemur
00020 {
00021   namespace parse
00022   {
00023     class Arabic_Stemmer 
00024     {
00025     public:
00026       Arabic_Stemmer(std::string stemFunc);  // stemFunc names the stemming variant; presumably matched against stemtable[].option -- confirm in the implementation
00027       ~Arabic_Stemmer();
00028       // Stem a term. NOTE(review): parameters are unnamed; presumably (input word, output buffer), as in remove_definite_articles(word, result) -- confirm.
00029       void stemTerm(char *, char *);
00030 
00031     private:
00032       void (Arabic_Stemmer::*stem_fct)(char *, char *) ;  // pointer to a member function with the common (char *, char *) signature; presumably the routine chosen at construction -- confirm
00033       void arabic_remove_diacritics (char *, char *);
00034       // ^ arabic_remove_diacritics (declared just above): removes diacritics from word
00035       void arabic_stop(char *, char *);  // only removes stopwords
00036       void no_stem(char *, char *) ;       // performs no stemming
00037       // normalize Arabic word (presumably describes the arabic_norm2 variants below -- confirm)
00038       void arabic_norm2(char *, char *);
00039       void arabic_norm2_stop(char *, char *);
00040       void arabic_light10(char *, char *);   
00041       void arabic_light10_stop(char *, char *);         
00042       // Stopword lookup container. Despite the "_ht" suffix this is an ordered std::set, not a hash table.
00043       struct ltstr {  // comparator for the set below
00044         bool operator()(const char* s1, const char* s2) const {
00045           return strcmp(s1, s2) < 0;  // order by string content, not by pointer value
00046         }
00047       };
00048       std::set<const char *, ltstr> stop_words_ht;  // holds pointers only; the pointed-to strings are not copied and must outlive the set
00049       bool on_stop_list (char *word);  // presumably true when word is found in stop_words_ht -- confirm
00050     // Table entry type: pairs an option name with a stemming member-function pointer.
00051       typedef struct {
00052         const char *option;
00053         void (Arabic_Stemmer::*stem_fct)(char *, char *) ;
00054       } stem_info_t;
00055       static stem_info_t stemtable[];  // static tables below are only declared here; their definitions are not in this header
00056       static const int ArabicVowel[256];  // 256-entry tables; presumably indexed by unsigned byte value -- confirm
00057       static const int Norm3Char[256];
00058       static const int NormChar[256];
00059       static const int isWhitespace[256];
00060       static const char *stopwords[];
00061       static const char *suffixes[];
00062       static const char *defarticles[];
00063       int is_whitespace (const char c);  // presumably a lookup into isWhitespace[] -- confirm
00064       void remove_definite_articles(char *word, char *result);
00065       void remove_all_suffixes(char *word, char *result, size_t lenlimit);  // NOTE(review): meaning of lenlimit is not visible in this header -- confirm
00066     };
00067   }
00068 }
00069 
00070     

Generated on Tue Jun 15 11:02:53 2010 for Lemur by doxygen 1.3.4