/**
 * Reads the BZip2-compressed workbook {@code excel2013testmultisheet.xlsx.bz2} through
 * {@code ExcelFileInputFormat} and verifies, row by row, the cells returned for the first
 * sheet (rows 1-6) and the second sheet (rows 7-9).
 *
 * @throws IOException if the splits cannot be computed or the record reader fails
 */
@Test
public void readExcelInputFormatBzip2CompressedExcel2013MultiSheetAll() throws IOException {
    JobConf job = new JobConf(defaultConf);
    // The codec is configured against the job but not referenced again in this method.
    CompressionCodec bzip2 = new BZip2Codec();
    ReflectionUtils.setConf(bzip2, job);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName="excel2013testmultisheet.xlsx.bz2";
    String fileNameSpreadSheet=classLoader.getResource(fileName).getFile();
    Path file = new Path(fileNameSpreadSheet);
    FileInputFormat.setInputPaths(job, file);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47","de");
    ExcelFileInputFormat format = new ExcelFileInputFormat();
    format.configure(job);
    InputSplit[] inputSplits = format.getSplits(job,1);
    assertEquals(1,inputSplits.length,"Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull(reader,"Format returned null RecordReader");
    // Always release the reader, even when one of the assertions below fails.
    try {
        Text spreadSheetKey = new Text();
        ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
        // Row 1 (first sheet)
        assertTrue(reader.next(spreadSheetKey,spreadSheetValue),"Input Split for Excel file contains row 1 (first sheet)");
        assertEquals("[excel2013testmultisheet.xlsx.bz2]Sheet1!A1",spreadSheetKey.toString(),"Input Split for Excel file has keyname == \"[excel2013testmultisheet.xlsx.bz2]Sheet1!A1\"");
        assertEquals(4,spreadSheetValue.get().length,"Input Split for Excel file contains row 1 with 4 columns");
        assertEquals("test1",((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getFormattedValue(),"Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
        assertEquals("Sheet1",((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getSheetName(),"Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\"");
        assertEquals("A1",((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getAddress(),"Input Split for Excel file contains row 1 with cell 1 address == \"A1\"");
        assertEquals("test2",((SpreadSheetCellDAO)spreadSheetValue.get()[1]).getFormattedValue(),"Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
        assertEquals("test3",((SpreadSheetCellDAO)spreadSheetValue.get()[2]).getFormattedValue(),"Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
        assertEquals("test4",((SpreadSheetCellDAO)spreadSheetValue.get()[3]).getFormattedValue(),"Input Split for Excel file contains row 1 with cell 4 == \"test4\"");
        // Row 2 (first sheet)
        assertTrue(reader.next(spreadSheetKey,spreadSheetValue),"Input Split for Excel file contains row 2 (first sheet)");
        assertEquals(1,spreadSheetValue.get().length,"Input Split for Excel file contains row 2 with 1 column");
        assertEquals("4",((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getFormattedValue(),"Input Split for Excel file contains row 2 with cell 1 == \"4\"");
        // Row 3 (first sheet)
        assertTrue(reader.next(spreadSheetKey,spreadSheetValue),"Input Split for Excel file contains row 3 (first sheet)");
        assertEquals(5,spreadSheetValue.get().length,"Input Split for Excel file contains row 3 with 5 columns");
        assertEquals("31/12/99",((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getFormattedValue(),"Input Split for Excel file contains row 3 with cell 1 == \"31/12/99\"");
        assertEquals("5",((SpreadSheetCellDAO)spreadSheetValue.get()[1]).getFormattedValue(),"Input Split for Excel file contains row 3 with cell 2 == \"5\"");
        assertNull(spreadSheetValue.get()[2],"Input Split for Excel file contains row 3 with cell 3 == null");
        assertNull(spreadSheetValue.get()[3],"Input Split for Excel file contains row 3 with cell 4 == null");
        assertEquals("null",((SpreadSheetCellDAO)spreadSheetValue.get()[4]).getFormattedValue(),"Input Split for Excel file contains row 3 with cell 5 == \"null\"");
        // Row 4 (first sheet)
        assertTrue(reader.next(spreadSheetKey,spreadSheetValue),"Input Split for Excel file contains row 4 (first sheet)");
        assertEquals(1,spreadSheetValue.get().length,"Input Split for Excel file contains row 4 with 1 column");
        assertEquals("1",((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getFormattedValue(),"Input Split for Excel file contains row 4 with cell 1 == \"1\"");
        // Row 5 (first sheet)
        assertTrue(reader.next(spreadSheetKey,spreadSheetValue),"Input Split for Excel file contains row 5 (first sheet)");
        assertEquals(3,spreadSheetValue.get().length,"Input Split for Excel file contains row 5 with 3 columns");
        assertEquals("2",((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getFormattedValue(),"Input Split for Excel file contains row 5 with cell 1 == \"2\"");
        assertEquals("6",((SpreadSheetCellDAO)spreadSheetValue.get()[1]).getFormattedValue(),"Input Split for Excel file contains row 5 with cell 2== \"6\"");
        assertEquals("10",((SpreadSheetCellDAO)spreadSheetValue.get()[2]).getFormattedValue(),"Input Split for Excel file contains row 5 with cell 3== \"10\"");
        // Row 6 (first sheet)
        assertTrue(reader.next(spreadSheetKey,spreadSheetValue),"Input Split for Excel file contains row 6 (first sheet)");
        assertEquals(3,spreadSheetValue.get().length,"Input Split for Excel file contains row 6 with 3 columns");
        assertEquals("3",((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getFormattedValue(),"Input Split for Excel file contains row 6 with cell 1 == \"3\"");
        assertEquals("4",((SpreadSheetCellDAO)spreadSheetValue.get()[1]).getFormattedValue(),"Input Split for Excel file contains row 6 with cell 2== \"4\"");
        assertEquals("15",((SpreadSheetCellDAO)spreadSheetValue.get()[2]).getFormattedValue(),"Input Split for Excel file contains row 6 with cell 3== \"15\"");
        // Row 7 (second sheet): check the width first so a short row fails with a clear message
        // instead of an ArrayIndexOutOfBoundsException on the cell access.
        assertTrue(reader.next(spreadSheetKey,spreadSheetValue),"Input Split for Excel file contains row 7 (second sheet)");
        assertEquals(2,spreadSheetValue.get().length,"Input Split for Excel file contains row 7 with 2 columns");
        assertEquals("8",((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getFormattedValue(),"Input Split for Excel file contains row 7 with cell 1 == \"8\"");
        assertEquals("99",((SpreadSheetCellDAO)spreadSheetValue.get()[1]).getFormattedValue(),"Input Split for Excel file contains row 7 with cell 2 == \"99\"");
        // Row 8 (second sheet)
        assertTrue(reader.next(spreadSheetKey,spreadSheetValue),"Input Split for Excel file contains row 8 (second sheet)");
        assertEquals(1,spreadSheetValue.get().length,"Input Split for Excel file contains row 8 with 1 column");
        assertEquals("test",((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getFormattedValue(),"Input Split for Excel file contains row 8 with cell 1 == \"test\"");
        // Row 9 (second sheet)
        assertTrue(reader.next(spreadSheetKey,spreadSheetValue),"Input Split for Excel file contains row 9 (second sheet)");
        assertEquals(3,spreadSheetValue.get().length,"Input Split for Excel file contains row 9 with 3 columns");
        assertNull(spreadSheetValue.get()[0],"Input Split for Excel file contains row 9 with cell 1 == null");
        assertNull(spreadSheetValue.get()[1],"Input Split for Excel file contains row 9 with cell 2 == null");
        assertEquals("seven",((SpreadSheetCellDAO)spreadSheetValue.get()[2]).getFormattedValue(),"Input Split for Excel file contains row 9 with cell 3 == \"seven\"");
    } finally {
        reader.close();
    }
}
java类org.apache.hadoop.io.compress.BZip2Codec的实例源码
OfficeFormatHadoopExcelTest.java 文件源码
项目:hadoopoffice
阅读 23
收藏 0
点赞 0
评论 0
DataGenerator.java 文件源码
项目:pregelix
阅读 25
收藏 0
点赞 0
评论 0
/**
 * Runs two MapReduce jobs back to back.
 *
 * <p>The first job reads {@code args[0]} and writes to the fixed path {@code /maxtemp}
 * using the max-id mapper/combiner/reducer with a single reduce task. The second job reads
 * {@code args[0]} again, is given the first job's output path via {@code hyracks.maxid.file},
 * and writes to {@code args[1]}.
 *
 * <p>Both output paths are deleted recursively before the corresponding job is submitted.
 *
 * @param args {@code args[0]} input path; {@code args[1]} output path of the second job;
 *             {@code args[2]} integer stored under {@code hyracks.x}; {@code args[3]} number
 *             of reduce tasks for the second job; optional {@code args[4]} selecting output
 *             compression when it starts with {@code "bzip"} or {@code "gz"}
 * @throws IOException if a file system operation or a job run fails
 * @throws IllegalArgumentException if fewer than four arguments are given or a numeric
 *             argument cannot be parsed
 */
public static void main(String[] args) throws IOException {
    if (args.length < 4) {
        throw new IllegalArgumentException(
                "Usage: DataGenerator <inputPath> <outputPath> <x> <numReduceTasks> [bzip*|gz*]");
    }
    // Parse numeric arguments before touching the file system or submitting a job, so that
    // malformed input fails fast instead of after the first job has already run.
    final int x = Integer.parseInt(args[2]);
    final int numReduceTasks = Integer.parseInt(args[3]);

    // Job 1: writes its result to the intermediate path.
    JobConf job = new JobConf(DataGenerator.class);
    FileSystem dfs = FileSystem.get(job);
    String maxFile = "/maxtemp";
    dfs.delete(new Path(maxFile), true);
    job.setJobName(DataGenerator.class.getSimpleName() + "max ID");
    job.setMapperClass(MapMaxId.class);
    job.setCombinerClass(CombineMaxId.class);
    job.setReducerClass(ReduceMaxId.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(VLongWritable.class);
    job.setInputFormat(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(maxFile));
    job.setNumReduceTasks(1);
    JobClient.runJob(job);

    // Job 2: record generation, configured with the location of job 1's output.
    job = new JobConf(DataGenerator.class);
    job.set("hyracks.maxid.file", maxFile);
    job.setInt("hyracks.x", x);
    dfs.delete(new Path(args[1]), true);
    job.setJobName(DataGenerator.class.getSimpleName());
    job.setMapperClass(MapRecordGen.class);
    job.setReducerClass(ReduceRecordGen.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setInputFormat(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setNumReduceTasks(numReduceTasks);
    if (args.length > 4) {
        if (args[4].startsWith("bzip")) {
            FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
        }
        if (args[4].startsWith("gz")) {
            FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        }
    }
    JobClient.runJob(job);
}
TestCompression.java 文件源码
项目:zSqoop
阅读 21
收藏 0
点赞 0
评论 0
/**
 * Invokes the shared text compression test helper with a fresh BZip2 codec and the
 * value 4 as its second argument.
 *
 * @throws IOException propagated from the helper
 */
public void testBzip2TextCompression() throws IOException {
    final BZip2Codec bzip2Codec = new BZip2Codec();
    runTextCompressionTest(bzip2Codec, 4);
}
TestCompression.java 文件源码
项目:zSqoop
阅读 21
收藏 0
点赞 0
评论 0
/**
 * Invokes the shared sequence-file compression test helper with a fresh BZip2 codec and
 * the value 4 as its second argument.
 *
 * @throws Exception propagated from the helper
 */
public void testBzip2SequenceFileCompression() throws Exception {
    final BZip2Codec bzip2Codec = new BZip2Codec();
    runSequenceFileCompressionTest(bzip2Codec, 4);
}
TestCompression.java 文件源码
项目:sqoop
阅读 20
收藏 0
点赞 0
评论 0
/**
 * Runs the text compression test helper, passing a newly created BZip2 codec together
 * with the value 4.
 *
 * @throws IOException propagated from the helper
 */
public void testBzip2TextCompression() throws IOException {
    final BZip2Codec codecUnderTest = new BZip2Codec();
    runTextCompressionTest(codecUnderTest, 4);
}
TestCompression.java 文件源码
项目:sqoop
阅读 21
收藏 0
点赞 0
评论 0
/**
 * Runs the sequence-file compression test helper, passing a newly created BZip2 codec
 * together with the value 4.
 *
 * @throws Exception propagated from the helper
 */
public void testBzip2SequenceFileCompression() throws Exception {
    final BZip2Codec codecUnderTest = new BZip2Codec();
    runSequenceFileCompressionTest(codecUnderTest, 4);
}
ManyTxtToFewSeqJob.java 文件源码
项目:MRSmallFileCombiner
阅读 23
收藏 0
点赞 0
评论 0
/**
 * Configures and runs a map-only job that reads from {@code args[0]} through
 * {@code ConfigurableInputFormat} and writes compressed sequence files to {@code args[1]}.
 *
 * <p>The output path is deleted recursively before the job is started.
 *
 * @param args {@code args[0]} input path; {@code args[1]} output path; {@code args[2]}
 *             number of mappers; optional {@code args[3]} compression codec name
 *             ({@code gzip} or {@code bzip2}, case-insensitive; anything else, or a missing
 *             value, selects Snappy)
 * @throws Exception if job setup, the file system call or job execution fails
 */
public static void main(String[] args) throws Exception {
    if (args.length < 3) {
        System.out
                .println("ManyTxtToFewSeqJob <inputPath> <outputPath> <# mappers> <compressionCodec>");
        System.out.println();
        System.out
                .println("Example: ManyTxtToFewSeqJob ./input ./output 20 snappy");
        return;
    }

    // Get values from args
    String inputPath = args[0];
    String outputPath = args[1];
    String numberOfMappers = args[2];
    // The guard above admits three arguments, so the codec must be treated as optional;
    // fall back to the same codec the selection below uses for unrecognised names.
    String compressionCodec = args.length > 3 ? args[3] : "snappy";

    // Create job
    Job job = new Job();
    job.setJobName("ManyTxtToFewSeqJob");
    job.setJarByClass(ManyTxtToFewSeqJob.class);

    // Define input format and path
    job.setInputFormatClass(ConfigurableInputFormat.class);
    ConfigurableInputFormat.setInputPath(job, inputPath);
    ConfigurableInputFormat.setMapperNumber(job, Integer.parseInt(numberOfMappers));

    // Define output format, path and compression
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
    // equalsIgnoreCase avoids the default-locale dependence of toLowerCase().
    if ("gzip".equalsIgnoreCase(compressionCodec)) {
        SequenceFileOutputFormat.setOutputCompressorClass(job,
                GzipCodec.class);
    } else if ("bzip2".equalsIgnoreCase(compressionCodec)) {
        SequenceFileOutputFormat.setOutputCompressorClass(job,
                BZip2Codec.class);
    } else {
        SequenceFileOutputFormat.setOutputCompressorClass(job,
                SnappyCodec.class);
    }

    // Define the mapper; no reducer is used
    job.setMapperClass(ConsalidatorMapper.class);

    // Define the key and value format
    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(BytesWritable.class);
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);
    job.setNumReduceTasks(0);

    // Remove any previous output so the job can write to the path
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(config);
    hdfs.delete(new Path(outputPath), true);

    // Run the job and wait for it to finish
    job.waitForCompletion(true);
}
HadoopCompressionCodecs.java 文件源码
项目:eoulsan
阅读 22
收藏 0
点赞 0
评论 0
/**
 * Wrap an input stream with the input stream produced by a new {@code BZip2Codec}.
 * @param is input stream
 * @return the stream returned by the codec for {@code is}
 * @throws IOException if an error occurs while creating the input stream
 */
public static InputStream createBZip2InputStream(final InputStream is)
    throws IOException {

  final BZip2Codec codec = new BZip2Codec();

  return codec.createInputStream(is);
}
HadoopCompressionCodecs.java 文件源码
项目:eoulsan
阅读 21
收藏 0
点赞 0
评论 0
/**
 * Wrap an output stream with the output stream produced by a new {@code BZip2Codec}.
 * @param os the output stream to compress
 * @return the stream returned by the codec for {@code os}
 * @throws IOException if an error occurs while creating the output stream
 */
public static OutputStream createBZip2OutputStream(final OutputStream os)
    throws IOException {

  final BZip2Codec codec = new BZip2Codec();

  return codec.createOutputStream(os);
}