Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

HarvestSortMerge.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2004-2008 Carnegie Mellon University and University of
00003  * Massachusetts.  All Rights Reserved.
00004  *
00005  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00006  * is subject to the terms of the software license set forth in the LICENSE
00007  * file included with this software, and also available at
00008  * http://www.lemurproject.org/license.html
00009  *
00010  *==========================================================================
00011 */
00012 
00013 #ifndef _HARVESTSORTMERGE_HPP
00014 #define _HARVESTSORTMERGE_HPP
00015 
00016 // class to sort and merge multiple text files into one
00017 // with early rejection of URLs that do not exist within the keyfile
00018 
00019 #include "SortMergeTextFiles.hpp"
00020 #include "Keyfile.hpp"
00021 #include "SHA1.hpp"
00022 
00023 namespace lemur {
00024   namespace file {
00025 
00026     class HarvestSortMerge : public SortMergeTextFiles { // sorts and merges multiple text files into one; per the file-level comment, URLs absent from the keyfile are rejected early
00027     protected:
00028       lemur::file::Keyfile *_docNoKeyfile; // NOTE(review): presumably the keyfile consulted for URL rejection, set from the constructor's docNoKeyfile argument; ownership is not visible in this header -- confirm
00029           lemur::utility::SHA1 SHA1Hasher; // SHA1 helper object held by value; how it is used is not shown in this header
00030       virtual void _doSingleFileMergesort(std::string &inputFile, std::string &outputFile, std::vector<std::string> &chunkList, int chunkRecordSize=16384*10); // virtual, so derived classes may override it; chunkRecordSize defaults to 16384*10 (163840). NOTE(review): presumably overrides a SortMergeTextFiles method -- confirm against the base class
00031 
00032     public:
00033       HarvestSortMerge(std::string &outputFilePath, std::string &tempDirectory, lemur::file::Keyfile *docNoKeyfile, int numMergeThreads=4, bool displayStatus=false); // both path strings are taken by non-const reference; defaults are numMergeThreads=4 and displayStatus=false
00034       ~HarvestSortMerge();
00035       // Static helper: by its name, splits inputLine on tab characters and stores the pieces in retVec. inputLine is a non-const char*, so the buffer may be modified -- TODO confirm in the implementation.
00036       static void splitLineOnTabs(char *inputLine, std::vector<std::string> &retVec);
00037 
00038     }; // end class HarvestSortMerge
00039   } // end namespace file
00040 } // end namespace lemur
00041 
00042 #endif // _HARVESTSORTMERGE_HPP

Generated on Tue Jun 15 11:02:54 2010 for Lemur by doxygen 1.3.4