Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

PageRank.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2002-2008 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 //
00012 // PageRank
00013 //
00014 // 17 July 2005 -- dam
00015 //
00016 
00017 #ifndef INDRI_PAGERANK_HPP
00018 #define INDRI_PAGERANK_HPP
00019 
00020 #include <string>
00021 #include <vector>
00022 #include <map>
00023 #include <cmath>
00024 #include "indri/UnparsedDocument.hpp"
00025 #include "indri/Parameters.hpp"
00026 #include "indri/FileTreeIterator.hpp"
00027 #include "indri/TaggedDocumentIterator.hpp"
00028 #include "indri/TaggedTextParser.hpp"
00029 #include "indri/Path.hpp"
00030 
00031 #include "indri/Repository.hpp"
00032 
00033 namespace indri
00034 {
00035   namespace parse
00036   {
// One document's PageRank record, identified by a string.
//
// This is a plain data holder: nothing in the class initializes its fields
// or keeps val and int_val in step with each other.
class pagerank
{
public:
  std::string doc;  // document identifier; tie-breaker when sorting
  float val;        // score; primary sort key
  int int_val;      // not read by the comparator; presumably the integer
                    // form of val produced by PageRank::_raw2int -- confirm

  // Ordering functor that puts larger scores first.
  //
  // Returns true when `one` must come before `two`: the record with the
  // greater val wins, and records with equal val are ordered by doc in
  // descending string order.
  //
  // operator() is const-qualified so the functor can be invoked through a
  // const object, which ordered containers such as std::set require of
  // their comparator.
  struct pagerank_greater
  {
    bool operator() (const pagerank &one, const pagerank &two) const
    {
      if (one.val == two.val) // match previous perl script secondary sort
        return one.doc > two.doc;
      return one.val > two.val;
    }
  };
};
00053     class prEntry {
00054     public:
00055       lemur::api::DOCID_T doc;
00056       float val;
00057       int int_val;
00058       struct prEntry_greater {
00059         bool operator() (const prEntry &one, const prEntry &two) {
00060           if (one.val == two.val)
00061             return one.doc > two.doc;
00062           return one.val > two.val;
00063         }
00064       };
00065     };
00066     
00067     class PageRank {
00068     private:
00069       // from Metztler's generate_priors.pl
00070       static const double _intToProb[11];
00071       float *prTable;
00072       
00073       const std::string _corpusPath;
00074       const std::string _linkPath;
00075 
00076       double _c; // dampening parameter
00077       UINT64 _colLen; // collection length
00078 
00079       indri::collection::Repository _repository;
00080       typedef std::map< std::string, float > PageRankVector;
00081       typedef std::map< std::string, std::pair< int, std::vector< std::string > > > Links;
00082 
00083       inline void _swap( std::string& a, std::string& b ) {
00084         std::string tmp = b;
00085         b = a;
00086         a = tmp;
00087       }
00088 
00089       void _computeColLen();
00090       
00091       float _readPageRankFromFile( std::ifstream& src, const std::string& sourceDoc );
00092       void _writePageRankToFile( std::ofstream& src, const std::string& destDoc, const float pr );
00093 
00094       void _computeOutDegrees( Links& links );
00095       void _doPageRankIter( const int docsPerIter, const std::string& srcFile, const std::string& destFile );
00096       void _updatePageRank( std::ifstream& src, std::ofstream& dest, Links& links );
00097 
00098       void _raw2int(std::vector<pagerank> &);
00099       void _ranks2int(std::vector<prEntry> &ranks);
00100       
00101       void _loadRanks( const std::string& dest, 
00102                        std::vector<pagerank> &pageranks);
00103       
00104     public:
00105       PageRank( const std::string& corpusPath, const std::string& linkPath, 
00106                 UINT64 colLen = 0 ) : _corpusPath( corpusPath ), 
00107                                       _linkPath( linkPath ),
00108                                       _colLen( colLen ), prTable(0) {
00109         if (_colLen == 0 ) _computeColLen();
00110       }
00111       PageRank( const std::string& corpusPath, const std::string& linkPath, 
00112                 const std::string& indexPath ) : _corpusPath( corpusPath ), 
00113                                                  _linkPath( linkPath ), prTable(0) {
00114 
00115         _repository.openRead(indexPath);
00116         indri::collection::Repository::index_state indexes = _repository.indexes();
00117         _colLen = 0;
00118         for( int i=0; i<indexes->size(); i++ ) {
00119           _colLen += (*indexes)[i]->documentCount();
00120         }
00121       }
00122       ~PageRank( ) {
00123         delete(prTable);
00124       }
00125       
00126       void computePageRank( const std::string& outputFile, const int maxIters = 10, const int docsPerIter = 1000, const double c = 0.7 );
00127       void writeRaw( const std::string& dest, const std::string &fawFile  );
00128       void writePriors( const std::string& dest, const std::string &priorFile  );
00129       void writeRanks( const std::string& dest, const std::string &ranksFile  );
00130 
00131       void indexPageRank(const std::string& outputFile, const int maxIters = 100, const double c = 0.85 );
00132     };
00133   }
00134 }
00135 
00136 #endif

Generated on Tue Jun 15 11:02:54 2010 for Lemur by doxygen 1.3.4