Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

TextParser.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // TextParser
00015 //
00016 // 16 August 2004 -- tds
00017 //
00018 
00019 #ifndef INDRI_TEXTPARSER_HPP
00020 #define INDRI_TEXTPARSER_HPP
00021 
00022 #include <stdio.h>
00023 #include <ctype.h>
00024 #include <string.h>
00025 #include <string>
00026 #include <vector>
00027 #include "indri/IndriParser.hpp"
00028 #include "indri/Buffer.hpp"
00029 #include "indri/ConflationPattern.hpp"
00030 #include "string-set.h"
00031 namespace indri
00032 {
00033   namespace parse
00034   {
00035     // TextParser: publicly derives from Parser; only declarations appear here, definitions live elsewhere.
00036     class TextParser : public Parser {
00037     public:
00038       TextParser();
00039       ~TextParser();
00040       // parse(): takes a TokenizedDocument* and returns a ParsedDocument*; result ownership is not stated in this header.
00041       indri::api::ParsedDocument* parse( TokenizedDocument* document );
00042       // NOTE(review): handle() presumably parses and forwards to the handler given to setHandler() -- confirm in the implementation.
00043       void handle( TokenizedDocument* document );
00044       void setHandler( ObjectHandler<indri::api::ParsedDocument>& h );  // handler is passed by reference
00045       // setTags(): takes four string lists (include, exclude, index, metadata) and a ConflationPattern* -> string map.
00046       void setTags( const std::vector<std::string>& include,
00047                     const std::vector<std::string>& exclude,
00048                     const std::vector<std::string>& index,
00049                     const std::vector<std::string>& metadata, 
00050                     const std::map<ConflationPattern*, std::string>& conflations );  // NOTE(review): <map> is not included directly by this header -- presumably transitive; confirm
00051 
00052     protected:
00053       void writeToken(char* token);  // overload taking only the token text
00054       void writeToken(char *token, int start, int end);  // overload also taking start/end ints (presumably offsets -- confirm)
00055       indri::utility::Buffer _termBuffer;  // NOTE(review): presumably backs text written by writeToken() -- confirm
00056 
00057     private:
00058       ObjectHandler<indri::api::ParsedDocument>* _handler;  // pointer to a handler; NOTE(review): presumably assigned by setHandler() -- confirm
00059       indri::api::ParsedDocument _document;  // ParsedDocument held by value inside the parser
00060     };
00061   } // namespace parse
00062 } // namespace indri
00063 
00064 #endif // INDRI_TEXTPARSER_HPP
00065 

Generated on Tue Jun 15 11:02:56 2010 for Lemur by doxygen 1.3.4