/** * Hadoop FileInputFormat for reading WARC files * * (C) 2009 - Carnegie Mellon University * * 1. Redistributions of this source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The names "Lemur", "Indri", "University of Massachusetts", * "Carnegie Mellon", and "lemurproject" must not be used to * endorse or promote products derived from this software without * prior written permission. To obtain permission, contact * license@lemurproject.org. * * 4. Products derived from this software may not be called "Lemur" or "Indri" * nor may "Lemur" or "Indri" appear in their names without prior written * permission of The Lemur Project. To obtain permission, * contact license@lemurproject.org. * * THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AS PART OF THE CLUEWEB09 * PROJECT AND OTHER CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN * NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * @author mhoy@cs.cmu.edu (Mark J. 
Hoy) */
package edu.cmu.lemurproject;

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

/**
 * Hadoop {@code FileInputFormat} (old {@code org.apache.hadoop.mapred} API) for reading
 * WARC files.
 *
 * <p>Input files are reported as non-splittable, and every split is read through a
 * {@link WarcFileRecordReader}.
 *
 * <p>NOTE(review): {@code FileInputFormat} and {@code RecordReader} are used as raw types
 * here, and {@code LongWritable} is imported but never referenced. Type arguments may have
 * been intended -- confirm against {@code WarcFileRecordReader} before adding them.
 */
public class WarcFileInputFormat extends FileInputFormat {

  /**
   * Reports every input file as non-splittable.
   *
   * @param fs the file system holding the file (ignored)
   * @param filename the path of the input file (ignored)
   * @return always {@code false}
   */
  @Override
  protected boolean isSplitable(FileSystem fs, Path filename) {
    // Unconditional: no file handled by this format may be divided into several splits.
    return false;
  }

  /**
   * Creates the record reader for the given split.
   *
   * @param split the input split to read
   * @param conf the job configuration, handed to the reader
   * @param reporter progress reporter supplied by the framework (not used here)
   * @return a new {@link WarcFileRecordReader} built from {@code conf} and {@code split}
   * @throws IOException presumably when the reader cannot be opened on the split; the
   *     exact conditions are defined by {@code WarcFileRecordReader} -- verify there
   */
  @Override
  public RecordReader getRecordReader(InputSplit split, JobConf conf, Reporter reporter)
      throws IOException {
    return new WarcFileRecordReader(conf, split);
  }
}