Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

DocFreqIndexer.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2000-2004 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software (and below), and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 #ifndef _DOCFREQINDEXER_HPP
00013 #define _DOCFREQINDEXER_HPP
00014 
00015 #include "TextHandler.hpp"
00016 #include "PushIndex.hpp"
00017 #include "InvFPTermList.hpp"
00018 #include "Parser.hpp"
00019 #include "WordSet.hpp"
00020 
00021 #include <stdio.h>
00022 
00023 
00024 namespace lemur 
00025 {
00026   namespace distrib
00027   {
00028     /// A lemur::api::TextHandler subclass; presumably records per-term document frequencies (inferred from the name only -- confirm in the implementation file).
00029     class DocFreqIndexer : public lemur::api::TextHandler {
00030       // Only declarations are visible here; every note marked "presumably" is an unverified reading of a name and should be checked against the definitions.
00031     public:
00032       DocFreqIndexer(const string &csName, const string &cwName, 
00033                      const string &ssName, int bufferSize, 
00034                      bool countStopWords = false);  ///< Takes three names, a buffer size, and a flag that defaults to false. NOTE(review): what each name refers to is decided by the out-of-line definition.
00035       ~DocFreqIndexer();  ///< Defined out of line. NOTE(review): the class holds raw pointers and FILE * members but declares no copy control here -- copying an instance may be unsafe; confirm.
00036       // NOTE(review): the three handle* members presumably override lemur::api::TextHandler hooks -- confirm against TextHandler.hpp.
00037       char * handleDoc(char * docno);  ///< Receives a document identifier string and returns a char *; the meaning of the return value is set by the definition.
00038       char * handleWord(char * word);  ///< Receives a word string and returns a char *; the meaning of the return value is set by the definition.
00039       void handleEndDoc();  ///< Takes no arguments; presumably invoked when the current document ends -- confirm.
00040 
00041       void newDb(const string &name);  ///< Takes a name by const reference; presumably starts output for a new database -- TODO confirm.
00042 
00043 
00044 
00045     private:
00046   
00047       int cw;  ///< NOTE(review): purpose not evident from this header; presumably a collection word count.
00048       int dfCount;  ///< Presumably a running document-frequency count -- confirm.
00049       bool first;  ///< Presumably a "first item" flag; what it guards is decided by the definitions.
00050 
00051       lemur::index::PushIndex * collsel;  ///< Pointer to a PushIndex; ownership cannot be determined from this header.
00052 
00053       lemur::parse::DocumentProps * csdp;  ///< Pointer to a DocumentProps; ownership cannot be determined from this header.
00054       lemur::index::InvFPTerm * term;  ///< Pointer to an InvFPTerm; ownership cannot be determined from this header.
00055 
00056       lemur::utility::WordSet docWords;  ///< WordSet held by value; presumably the distinct words seen in the current document.
00057 
00058       FILE * collWords;  ///< C stdio stream; presumably tied to the constructor's cwName -- confirm.
00059       FILE * serverSizes;  ///< C stdio stream; presumably tied to the constructor's ssName -- confirm.
00060       int numDocs;  ///< Presumably the number of documents handled so far.
00061   
00062       bool countStopWds;  ///< Presumably stores the constructor's countStopWords argument.
00063 
00064     };
00065   }
00066 }
00067 
00068 #endif

Generated on Tue Jun 15 11:02:53 2010 for Lemur by doxygen 1.3.4