Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

AnchorTextWriter.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // AnchorTextWriter
00015 //
00016 // 20 May 2004 -- tds
00017 //
00018 
00019 #ifndef INDRI_ANCHORTEXTWRITER_HPP
00020 #define INDRI_ANCHORTEXTWRITER_HPP
00021 
#include <algorithm>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include "indri/Path.hpp"
#include "lemur-compat.hpp"
00026 namespace indri
00027 {
00028   namespace parse
00029   {
00031     class AnchorTextWriter : public ObjectHandler<indri::api::ParsedDocument> {
00032     private:
00033       std::ofstream _out;
00034 
00035     public:
00036       AnchorTextWriter( const std::string& outputPath ) {
00037         std::string directory = indri::file::Path::directory( outputPath );
00038         indri::file::Path::make( directory );
00039         _out.open( outputPath.c_str(), std::ios::out | std::ios::binary);
00040       }
00041 
00042       ~AnchorTextWriter() {
00043         _out.close();
00044       }
00045 
00046       void handle( indri::api::ParsedDocument* document ) {
00047         indri::utility::greedy_vector<MetadataPair>::iterator iter;
00048 
00049         iter = std::find_if( document->metadata.begin(),
00050                              document->metadata.end(),
00051                              MetadataPair::key_equal( "docno" ) );
00052 
00053         const char* docno = (char*)iter->value;
00054 
00055         iter = std::find_if( document->metadata.begin(),
00056                              document->metadata.end(),
00057                              MetadataPair::key_equal( "url" ) );
00058 
00059         const char* page = (char*)iter->value;
00060         const char* url = 0;
00061         int count = 0;
00062         int urlEnd = -1;
00063 
00064         // find the third slash, which should occur
00065         // right after the domain name
00066         const char* slash = 0;
00067         if(page)  slash = strchr( page, '/' );
00068         if(slash) slash = strchr( slash+1, '/' );
00069         if(slash) slash = strchr( slash+1, '/' );
00070 
00071         size_t domainLength;
00072         if( slash )
00073           domainLength = slash - page;
00074         else
00075           domainLength = strlen(page);
00076 
00077         // count links
00078         for( unsigned int i=0; i<document->tags.size(); i++ ) {
00079           TagExtent& extent = *(document->tags[i]);
00080 
00081           // we only extract absolute urls
00082           if( !strcmp( extent.name, "absolute-url" ) ||
00083               !strcmp( extent.name, "relative-url" ) ) {
00084             url = document->terms[ extent.begin ];
00085             urlEnd = extent.end;
00086 
00087             // if it has the same domain, throw it out
00088             //if( url && page && !lemur_compat::strncasecmp( url, page, domainLength ) ) {
00089             //  url = 0;
00090             //  urlEnd = -1;
00091             //}
00092           } else if( !strcmp( extent.name, "a" ) &&  // this is anchor text
00093                      url &&                          // we've seen a url
00094                      urlEnd == extent.begin &&       // this text is associated with an absolute-url
00095                      extent.end - extent.begin > 0 ) // there is some text here
00096             {
00097               count++;
00098               url = 0;
00099             }
00100         }
00101 
00102         // print output
00103         _out << "DOCNO=" << docno << std::endl;
00104         _out << "DOCURL=" << page << std::endl;
00105         _out << "LINKS=" << count << std::endl;
00106         url = 0;
00107         urlEnd = -1;
00108 
00109         for( unsigned int i=0; i<document->tags.size(); i++ ) {
00110           TagExtent& extent = *(document->tags[i]);
00111 
00112           if( !strcmp( extent.name, "absolute-url" ) ||
00113               !strcmp( extent.name, "relative-url" ) ) {  // this is an absolute url
00114             url = document->terms[ extent.begin ];
00115             urlEnd = extent.end;
00116 
00117             // if it has the same domain, throw it out
00118             //if( url && page && !lemur_compat::strncasecmp( url, page, domainLength ) ) {
00119             //  url = 0;
00120             //  urlEnd = -1;
00121             //}
00122           } else if( !strcmp( extent.name, "a" ) &&  // this is anchor text
00123                      url &&                          // we've seen a url
00124                      urlEnd == extent.begin &&       // this text is associated with an absolute-url
00125                      extent.end - extent.begin > 0 ) // there is some text here
00126             {
00127               int textLength = 0;
00128 
00129               _out << "LINKURL=" << url << std::endl;
00130               _out << "TEXT=\"";
00131               for( size_t j=extent.begin; int(j) < extent.end && textLength < 60000; j++ ) {
00132                 if( !document->terms[j] )
00133                   continue;
00134 
00135                 textLength += strlen(document->terms[j])+1;
00136                 _out << document->terms[j] << " ";
00137               }
00138               _out << "\"" << std::endl;
00139 
00140               // only do the same link once
00141               url = 0;
00142             }
00143         }
00144       }
00145     };
00146   }
00147 }
00148 
00149 #endif // INDRI_ANCHORTEXTWRITER_HPP
00150 

Generated on Tue Jun 15 11:02:53 2010 for Lemur by doxygen 1.3.4