Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

HTMLParser.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // HTMLParser
00015 //
00016 // March 2004 -- metzler
00017 //
00018 #ifndef HTMLPARSER_HPP
00019 #define HTMLPARSER_HPP
00020 #include "indri/TaggedTextParser.hpp"
00021 
00022 #ifndef MAX_URL_LENGTH // a definition made before this header is included takes precedence
00023 #define MAX_URL_LENGTH 4096 // default element count of the HTMLParser::url and HTMLParser::base_url char arrays
00024 #endif
00025 namespace indri
00026 {
00027   namespace parse
00028   {
00029     
00030     class HTMLParser : public TaggedTextParser { // TaggedTextParser subclass that adds URL buffers and URL-related tag_properties pointers
00031     public:
00032       HTMLParser() { // empty body: no data member is explicitly initialized here
00033       }
00034   
00035       ~HTMLParser() { } // empty body; not marked virtual here -- NOTE(review): confirm the base class declares a virtual destructor
00036 
00037     protected:
00038       virtual void initialize( TokenizedDocument* tokenized, indri::api::ParsedDocument* parsed ); // declared only; defined outside this header -- NOTE(review): presumably overrides a TaggedTextParser hook, confirm
00039       virtual void cleanup( TokenizedDocument* tokenized, indri::api::ParsedDocument* parsed ); // declared only; same parameter list as initialize()
00040       virtual void handleTag(TagEvent *te); // declared only; receives a pointer to a TagEvent
00041       char url[MAX_URL_LENGTH]; // fixed-size char array of MAX_URL_LENGTH elements
00042       char base_url[MAX_URL_LENGTH]; // second fixed-size char array, same capacity as url
00043       void prepURL(char *s); // non-virtual helper taking a mutable C string; definition not visible in this header
00044       bool normalizeURL(char *s); // non-virtual helper taking a mutable C string -- NOTE(review): meaning of the bool result is not visible here, presumably success/failure
00045 
00046       tag_properties* _relativeUrlTag; // raw pointer, left uninitialized by the constructor above -- NOTE(review): ownership not visible here
00047       tag_properties* _absoluteUrlTag; // raw pointer, left uninitialized by the constructor above
00048       tag_properties* _anchorTag; // raw pointer, left uninitialized by the constructor above
00049 
00050       indri::utility::Buffer _urlBuffer; // Buffer held by value; default-constructed with the parser
00051     };
00052   }
00053 }
00054 
00055 #endif

Generated on Tue Jun 15 11:02:54 2010 for Lemur by doxygen 1.3.4