Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

SortMergeTextFiles.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2004-2008 Carnegie Mellon University and University of
00003  * Massachusetts.  All Rights Reserved.
00004  *
00005  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00006  * is subject to the terms of the software license set forth in the LICENSE
00007  * file included with this software, and also available at
00008  * http://www.lemurproject.org/license.html
00009  *
00010  *==========================================================================
00011 */
00012 
00013 #ifndef _SORTMERGETEXTFILES_HPP
00014 #define _SORTMERGETEXTFILES_HPP
00015 
00016 // class to sort and merge multiple text files into one
00017 
00018 #include <time.h>
00019 #include <algorithm>
00020 #include <iostream>
00021 #include <fstream>
00022 #include <string>
00023 #include <sstream>
00024 #include <vector>
00025 #include <list>
00026 
00027 #include "indri/Buffer.hpp"
00028 #include "indri/Path.hpp"
00029 #include "indri/UtilityThread.hpp"
00030 #include "lemur-compat.hpp"
00031 #include "Exception.hpp"
00032 
00033 namespace lemur {
00034   namespace file {
00035 
00039     class FileMergeThread : public indri::thread::UtilityThread {
            // UtilityThread subclass holding the state needed to combine up to
            // MAX_INPUT_FILES input text files into a single output file.
            // Only declarations are visible in this header; the constructor,
            // work(), initialize(), deinitialize() and chooseNextBuffer() are
            // defined out of line.
            // Each instance embeds roughly 3 MiB of fixed-size buffers
            // (16 x 64 KiB line buffers plus the 2 MiB _outputBuffer).
            // NOTE(review): given that size, instances are presumably meant to
            // be heap-allocated rather than placed on the stack -- confirm at
            // the call sites.
00040     public:
00041       enum {
              // Dimension of every per-input array below (inputFile, filePath,
              // _buffer, fileDone).
00042         MAX_INPUT_FILES=16,
              // Size in bytes of each per-input line buffer in _buffer.
00043         MAX_INPUT_LINESIZE=65536
00044       };
00045 
00046     private:
            // One C stdio handle per input slot.
00047       FILE *inputFile[MAX_INPUT_FILES];
            // Output stream; presumably opened on outputFilePath -- confirm in
            // the implementation.
00048       std::ofstream outfile;
00049 
            // Path for each input slot (same indexing as inputFile).
00050       std::string filePath[MAX_INPUT_FILES];
00051       std::string outputFilePath;
00052 
            // One fixed-size character buffer per input slot.
00053       char _buffer[MAX_INPUT_FILES][MAX_INPUT_LINESIZE];
            // Per-slot flag; presumably set once that input is exhausted --
            // verify against the implementation.
00054       bool fileDone[MAX_INPUT_FILES];
            // Earlier, smaller declaration kept for reference; superseded by
            // the fixed 2 MiB buffer on the following line.
00055       //      char _outputBuffer[lemur::file::FileMergeThread::MAX_INPUT_LINESIZE];
00056       char _outputBuffer[2*1024*1024];
00057 
            // Presumably the number of slots actually in use (<= MAX_INPUT_FILES)
            // -- TODO confirm that the constructor enforces the upper bound.
00058       int numInputFiles;
00059 
            // Value exposed through getRecordCounter().
00060       int recordCounter;
            // Value exposed through isThreadActive().
00061       bool isActive;
00062 
            // Presumably returns the index of the input buffer to emit next;
            // the selection rule and any "none left" sentinel are not visible
            // here -- see the implementation.
00063       int chooseNextBuffer();
00064 
00065     public:
            // inputFileList: paths of the files to merge (taken by non-const
            // reference). outputFile: path of the merged result.
00066       FileMergeThread(std::vector<std::string> &inputFileList, const std::string& outputFile);
            // Empty body: the destructor itself releases nothing, so any
            // closing of inputFile[] has to happen elsewhere (presumably in
            // deinitialize() -- confirm).
00067       ~FileMergeThread() { }
00068 
            // Always reports false.
00069       virtual bool hasWork() { return false; }
            // NOTE(review): the meaning of the UINT64 returned by work() and
            // initialize() is defined by indri::thread::UtilityThread, which is
            // not visible here -- confirm.
00070       virtual UINT64 work();
00071       virtual UINT64 initialize();
00072       virtual void deinitialize();
00073 
            // Plain accessors for recordCounter and isActive.
            // NOTE(review): the reads are unsynchronized; if these are polled
            // from another thread while the worker is running, confirm that is
            // acceptable.
00074       int getRecordCounter() { return recordCounter; }
00075       bool isThreadActive() { return isActive; }
00076 
00077     };
00078 
00079     class SortMergeTextFiles {
            // Sorts and merges multiple text files into one output file (see
            // the comment at the top of this header). Only declarations are
            // visible here; all non-inline members are defined out of line.
            // The helper members are protected and _doSingleFileMergesort is
            // virtual, so the class can be subclassed.
00080     protected:
            // Stored configuration; presumably copied from the constructor
            // arguments of the same names -- confirm in the implementation.
00081       std::string _outputFilePath;
00082       std::string _tempDirectory;
00083 
            // Two indri buffers; presumably one per input stream when two
            // files are merged -- verify against _mergeSortTwoFiles.
00084       indri::utility::Buffer _inputBuffer;
00085       indri::utility::Buffer _inputBufferTwo;
00086 
            // Toggled through showStatus().
00087       bool _displayStatus;
00088       int _numMergeThreads;
00089 
            // Presumably writes the in-memory records out as chunk number
            // currentChunkNumber under basePathname and returns the path of the
            // chunk file -- TODO confirm. inMemRecords is a raw pointer;
            // whether it may be null is not visible here.
00090       std::string _flushChunks(std::string& basePathname, std::vector<std::string> *inMemRecords, int currentChunkNumber);
            // doCleanup defaults to true; presumably it controls removal of the
            // two inputs after merging -- confirm. The meaning of the int
            // result is not visible here.
00091       int _mergeSortTwoFiles(std::string &firstFilePath, std::string &secondFilePath, std::string &outputFile, bool doCleanup=true);
            // Returns a list of paths; recordCounter is an in/out reference
            // parameter. Presumably an intermediate merge pass before the
            // final one -- confirm.
00092       std::vector<std::string> _doMidFinalMerge(std::vector<std::string> &inputList, std::string &outputPathBase, int &recordCounter);
00093       int _doFinalMergesortFiles(std::vector<std::string> &inputFiles, std::string &outputFile);
00094 
            // Overridable step. chunkRecordSize defaults to 16384*10 (163840).
            // NOTE(review): default arguments on virtual functions are bound
            // from the static type at the call site, so an override should
            // repeat the same default to avoid surprises.
00095       virtual void _doSingleFileMergesort(std::string &inputFile, std::string &outputFile, std::vector<std::string> &chunkList, int chunkRecordSize=16384*10);
00096 
00097     public:
            // Both paths are taken by non-const reference, so callers must pass
            // named std::string objects (temporaries and string literals will
            // not bind). numMergeThreads defaults to 4, displayStatus to false.
00098       SortMergeTextFiles(std::string &outputFilePath, std::string &tempDirectory, int numMergeThreads=4, bool displayStatus=false);
            // NOTE(review): the destructor is non-virtual although the class
            // declares a virtual member; deleting a derived object through a
            // SortMergeTextFiles* would be undefined behaviour. Consider making
            // it virtual if polymorphic deletion is ever used.
00099       ~SortMergeTextFiles();
00100 
            // Entry point taking the list of input file paths. The meaning of
            // the int result is not visible here -- see the implementation.
00101       int sort(std::vector<std::string> &inputFilePaths);
            // Public static helper despite the leading underscore. Takes a
            // stdio stream and a Buffer; beginLine and lineLength are output
            // references. Presumably the bool signals whether a line was read
            // -- confirm.
00102       static bool _readLine(FILE *_in, char*& beginLine, size_t& lineLength, indri::utility::Buffer &_buffer);
            // Sets the _displayStatus flag.
00103       void showStatus(bool displayStatus) { _displayStatus=displayStatus; }
00104 
00105     };
00106   } // end namespace file
00107 } // end namespace lemur
00108 
00109 #endif // _SORTMERGETEXTFILES_HPP

Generated on Tue Jun 15 11:02:55 2010 for Lemur by doxygen 1.3.4