Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

indri::parse::TextTokenizer Class Reference

#include <TextTokenizer.hpp>

Inheritance diagram for indri::parse::TextTokenizer:

indri::parse::Tokenizer, indri::parse::ObjectHandler< UnparsedDocument >

List of all members.

Public Member Functions

 TextTokenizer (bool tokenize_markup=true, bool tokenize_entire_words=true)
 ~TextTokenizer ()
TokenizedDocument * tokenize (UnparsedDocument *document)
void handle (UnparsedDocument *document)
void setHandler (ObjectHandler< TokenizedDocument > &h)

Protected Member Functions

void processASCIIToken ()
void processUTF8Token ()
void processTag ()

Protected Attributes

indri::utility::Buffer _termBuffer
UTF8Transcoder _transcoder
bool _tokenize_markup
bool _tokenize_entire_words

Private Member Functions

void writeToken (char *token, int token_len, int extent_begin, int extent_end)

Private Attributes

ObjectHandler< TokenizedDocument > * _handler
TokenizedDocument _document

Constructor & Destructor Documentation

indri::parse::TextTokenizer::TextTokenizer (bool tokenize_markup = true, bool tokenize_entire_words = true)  [inline]
 

indri::parse::TextTokenizer::~TextTokenizer ()  [inline]
 


Member Function Documentation

void indri::parse::TextTokenizer::handle (UnparsedDocument * document)  [virtual]
 

Implements indri::parse::Tokenizer.

void indri::parse::TextTokenizer::processASCIIToken ()  [protected]
 

void indri::parse::TextTokenizer::processTag ()  [protected]
 

void indri::parse::TextTokenizer::processUTF8Token ()  [protected]
 

void indri::parse::TextTokenizer::setHandler (ObjectHandler< TokenizedDocument > & h)  [virtual]
 

Implements indri::parse::Tokenizer.

TokenizedDocument* indri::parse::TextTokenizer::tokenize (UnparsedDocument * document)  [virtual]
 

Implements indri::parse::Tokenizer.

void indri::parse::TextTokenizer::writeToken (char * token, int token_len, int extent_begin, int extent_end)  [private]
 


Member Data Documentation

TokenizedDocument indri::parse::TextTokenizer::_document [private]
 

ObjectHandler<TokenizedDocument>* indri::parse::TextTokenizer::_handler [private]
 

indri::utility::Buffer indri::parse::TextTokenizer::_termBuffer [protected]
 

bool indri::parse::TextTokenizer::_tokenize_entire_words [protected]
 

bool indri::parse::TextTokenizer::_tokenize_markup [protected]
 

UTF8Transcoder indri::parse::TextTokenizer::_transcoder [protected]
 


The documentation for this class was generated from the following file: TextTokenizer.hpp
Generated on Tue Jun 15 11:03:03 2010 for Lemur by doxygen 1.3.4