/**
* Configure <code>job</code> with a TotalOrderPartitioner, partitioning against
* <code>splitPoints</code>. Cleans up the partitions file after the job exits.
*/
static void configurePartitioner(Job job, List<ImmutableBytesWritable> splitPoints)
throws IOException {
Configuration conf = job.getConfiguration();
// create the partitions file
FileSystem fs = FileSystem.get(conf);
String hbaseTmpFsDir =
conf.get(HConstants.TEMPORARY_FS_DIRECTORY_KEY,
HConstants.DEFAULT_TEMPORARY_HDFS_DIRECTORY);
Path partitionsPath = new Path(hbaseTmpFsDir, "partitions_" + UUID.randomUUID());
fs.makeQualified(partitionsPath);
writePartitions(conf, partitionsPath, splitPoints);
fs.deleteOnExit(partitionsPath);
// configure job to use it
job.setPartitionerClass(TotalOrderPartitioner.class);
TotalOrderPartitioner.setPartitionFile(conf, partitionsPath);
}
Example source code for the Java class org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner

HFileOutputFormat2.java source code
Project: ditb
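The writePartitions helper invoked above is not shown on this page. TotalOrderPartitioner expects a SequenceFile of sorted, distinct split keys with NullWritable values, so a helper along the following lines would work; the class name and details are assumptions, not the ditb/HBase implementation.

import java.io.IOException;
import java.util.List;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;

class PartitionFileSketch {
  static void writePartitions(Configuration conf, Path partitionsPath,
      List<ImmutableBytesWritable> splitPoints) throws IOException {
    // TotalOrderPartitioner requires the split keys in ascending order with no duplicates.
    TreeSet<ImmutableBytesWritable> sorted = new TreeSet<ImmutableBytesWritable>(splitPoints);
    if (!sorted.isEmpty() && sorted.first().getLength() == 0) {
      // An empty first key would create an empty first range; drop it.
      sorted.remove(sorted.first());
    }
    SequenceFile.Writer writer = SequenceFile.createWriter(conf,
        SequenceFile.Writer.file(partitionsPath),
        SequenceFile.Writer.keyClass(ImmutableBytesWritable.class),
        SequenceFile.Writer.valueClass(NullWritable.class));
    try {
      for (ImmutableBytesWritable split : sorted) {
        writer.append(split, NullWritable.get());
      }
    } finally {
      writer.close();
    }
  }
}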
HadoopFileSorter.java source code
Project: ldbc_snb_datagen
/**
* Sorts a Hadoop sequence file
*
* @param inputFileName The name of the file to sort.
* @param outputFileName The name of the sorted file.
* @throws Exception
*/
public void run(String inputFileName, String outputFileName) throws Exception {
int numThreads = conf.getInt("ldbc.snb.datagen.generator.numThreads", 1);
Job job = Job.getInstance(conf, "Sorting " + inputFileName);
FileInputFormat.setInputPaths(job, new Path(inputFileName));
FileOutputFormat.setOutputPath(job, new Path(outputFileName));
job.setMapOutputKeyClass(K);
job.setMapOutputValueClass(V);
job.setOutputKeyClass(K);
job.setOutputValueClass(V);
job.setNumReduceTasks(numThreads);
job.setJarByClass(V);
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
InputSampler.Sampler sampler = new InputSampler.RandomSampler(0.1, 1000);
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path(inputFileName + "_partition.lst"));
InputSampler.writePartitionFile(job, sampler);
job.setPartitionerClass(TotalOrderPartitioner.class);
if (!job.waitForCompletion(true)) {
throw new Exception();
}
}
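One side effect of the snippet above is that the "<inputFileName>_partition.lst" file written by InputSampler is left behind. A small, hypothetical cleanup helper (not part of ldbc_snb_datagen) could remove it once the job has finished:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

class PartitionCleanupSketch {
  static void deletePartitionFile(Configuration conf, String inputFileName) throws IOException {
    Path partitionFile = new Path(inputFileName + "_partition.lst");
    FileSystem fs = partitionFile.getFileSystem(conf);
    // recursive = false: the partition file is a single SequenceFile, not a directory
    fs.delete(partitionFile, false);
  }
}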
CubeHFileJob.java source code
Project: kylin
/**
* Check whether a partition file exists for the hfile; if so, use it in place of the table splits so the job gets more reducers.
* @param conf the job configuration
* @param path the hfile partition file
* @throws IOException
*/
@SuppressWarnings("deprecation")
private void reconfigurePartitions(Configuration conf, Path path) throws IOException {
FileSystem fs = path.getFileSystem(conf);
if (fs.exists(path)) {
try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf)) {
int partitionCount = 0;
Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
while (reader.next(key, value)) {
partitionCount++;
}
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), path);
// The reduce tasks should be one more than partition keys
job.setNumReduceTasks(partitionCount + 1);
}
} else {
logger.info("File '" + path.toString() + " doesn't exist, will not reconfigure hfile Partitions");
}
}
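The SequenceFile.Reader(fs, path, conf) constructor used above is deprecated in Hadoop 2. A hedged equivalent of the counting loop with the Reader.Option API (illustrative names, not Kylin's code):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

class PartitionCountSketch {
  // Counts the split keys in a TotalOrderPartitioner partition file; the caller
  // would then set the number of reduce tasks to count + 1.
  static int countPartitionKeys(Configuration conf, Path path) throws IOException {
    int count = 0;
    try (SequenceFile.Reader reader =
             new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
      Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
      Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
      while (reader.next(key, value)) {
        count++;
      }
    }
    return count;
  }
}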
HashTable.java source code
Project: hbase
public Job createSubmittableJob(String[] args) throws IOException {
Path partitionsPath = new Path(destPath, PARTITIONS_FILE_NAME);
generatePartitions(partitionsPath);
Job job = Job.getInstance(getConf(),
getConf().get("mapreduce.job.name", "hashTable_" + tableHash.tableName));
Configuration jobConf = job.getConfiguration();
jobConf.setLong(HASH_BATCH_SIZE_CONF_KEY, tableHash.batchSize);
job.setJarByClass(HashTable.class);
TableMapReduceUtil.initTableMapperJob(tableHash.tableName, tableHash.initScan(),
HashMapper.class, ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);
// use a TotalOrderPartitioner and reducers to group region output into hash files
job.setPartitionerClass(TotalOrderPartitioner.class);
TotalOrderPartitioner.setPartitionFile(jobConf, partitionsPath);
job.setReducerClass(Reducer.class); // identity reducer
job.setNumReduceTasks(tableHash.numHashFiles);
job.setOutputKeyClass(ImmutableBytesWritable.class);
job.setOutputValueClass(ImmutableBytesWritable.class);
job.setOutputFormatClass(MapFileOutputFormat.class);
FileOutputFormat.setOutputPath(job, new Path(destPath, HASH_DATA_DIR));
return job;
}
HFileOutputFormat2.java source code
Project: hbase
/**
* Configure <code>job</code> with a TotalOrderPartitioner, partitioning against
* <code>splitPoints</code>. Cleans up the partitions file after the job exits.
*/
static void configurePartitioner(Job job, List<ImmutableBytesWritable> splitPoints, boolean
writeMultipleTables)
throws IOException {
Configuration conf = job.getConfiguration();
// create the partitions file
FileSystem fs = FileSystem.get(conf);
String hbaseTmpFsDir =
conf.get(HConstants.TEMPORARY_FS_DIRECTORY_KEY,
HConstants.DEFAULT_TEMPORARY_HDFS_DIRECTORY);
Path partitionsPath = new Path(hbaseTmpFsDir, "partitions_" + UUID.randomUUID());
fs.makeQualified(partitionsPath);
writePartitions(conf, partitionsPath, splitPoints, writeMultipleTables);
fs.deleteOnExit(partitionsPath);
// configure job to use it
job.setPartitionerClass(TotalOrderPartitioner.class);
TotalOrderPartitioner.setPartitionFile(conf, partitionsPath);
}
HashTable.java source code
Project: ditb
public Job createSubmittableJob(String[] args) throws IOException {
Path partitionsPath = new Path(destPath, PARTITIONS_FILE_NAME);
generatePartitions(partitionsPath);
Job job = Job.getInstance(getConf(),
getConf().get("mapreduce.job.name", "hashTable_" + tableHash.tableName));
Configuration jobConf = job.getConfiguration();
jobConf.setLong(HASH_BATCH_SIZE_CONF_KEY, tableHash.batchSize);
job.setJarByClass(HashTable.class);
TableMapReduceUtil.initTableMapperJob(tableHash.tableName, tableHash.initScan(),
HashMapper.class, ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);
// use a TotalOrderPartitioner and reducers to group region output into hash files
job.setPartitionerClass(TotalOrderPartitioner.class);
TotalOrderPartitioner.setPartitionFile(jobConf, partitionsPath);
job.setReducerClass(Reducer.class); // identity reducer
job.setNumReduceTasks(tableHash.numHashFiles);
job.setOutputKeyClass(ImmutableBytesWritable.class);
job.setOutputValueClass(ImmutableBytesWritable.class);
job.setOutputFormatClass(MapFileOutputFormat.class);
FileOutputFormat.setOutputPath(job, new Path(destPath, HASH_DATA_DIR));
return job;
}
IntegrationTestImportTsv.java source code
Project: ditb
/**
* Confirm the absence of the {@link TotalOrderPartitioner} partitions file.
*/
protected static void validateDeletedPartitionsFile(Configuration conf) throws IOException {
if (!conf.getBoolean(IntegrationTestingUtility.IS_DISTRIBUTED_CLUSTER, false))
return;
FileSystem fs = FileSystem.get(conf);
Path partitionsFile = new Path(TotalOrderPartitioner.getPartitionFile(conf));
assertFalse("Failed to clean up partitions file.", fs.exists(partitionsFile));
}
SortUilts.java source code
Project: SOAPgaea
public static void configureSampling(Path outPath, BioJob job, VCFSortOptions options) throws IOException{
Configuration conf = job.getConfiguration();
final Path partition = outPath.getFileSystem(conf).makeQualified(new Path(outPath, "_partitioning" + "VCF"));
TotalOrderPartitioner.setPartitionFile(conf, partition);
try {
final URI partitionURI = new URI(partition.toString() + "#" + partition.getName());
if(partitionURI.getScheme().equals("file"))
return;
ReferenceShare.distributeCache(partitionURI.toString(), job);
} catch (URISyntaxException e) { throw new RuntimeException(e); }
}
HFileOutputFormat2.java source code
Project: pbase
/**
* Configure <code>job</code> with a TotalOrderPartitioner, partitioning against
* <code>splitPoints</code>. Cleans up the partitions file after the job exits.
*/
static void configurePartitioner(Job job, List<ImmutableBytesWritable> splitPoints)
throws IOException {
Configuration conf = job.getConfiguration();
// create the partitions file
FileSystem fs = FileSystem.get(conf);
Path partitionsPath = new Path(conf.get("hadoop.tmp.dir"), "partitions_" + UUID.randomUUID());
fs.makeQualified(partitionsPath);
writePartitions(conf, partitionsPath, splitPoints);
fs.deleteOnExit(partitionsPath);
// configure job to use it
job.setPartitionerClass(TotalOrderPartitioner.class);
TotalOrderPartitioner.setPartitionFile(conf, partitionsPath);
}
HFileOutputFormat2.java source code
Project: HIndex
/**
* Configure <code>job</code> with a TotalOrderPartitioner, partitioning against
* <code>splitPoints</code>. Cleans up the partitions file after the job exits.
*/
static void configurePartitioner(Job job, List<ImmutableBytesWritable> splitPoints)
throws IOException {
// create the partitions file
FileSystem fs = FileSystem.get(job.getConfiguration());
Path partitionsPath = new Path("/tmp", "partitions_" + UUID.randomUUID());
fs.makeQualified(partitionsPath);
fs.deleteOnExit(partitionsPath);
writePartitions(job.getConfiguration(), partitionsPath, splitPoints);
// configure job to use it
job.setPartitionerClass(TotalOrderPartitioner.class);
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionsPath);
}
IntegrationTestImportTsv.java source code
Project: HIndex
/**
* Confirm the absence of the {@link TotalOrderPartitioner} partitions file.
*/
protected static void validateDeletedPartitionsFile(Configuration conf) throws IOException {
if (!conf.getBoolean(IntegrationTestingUtility.IS_DISTRIBUTED_CLUSTER, false))
return;
FileSystem fs = FileSystem.get(conf);
Path partitionsFile = new Path(TotalOrderPartitioner.getPartitionFile(conf));
assertFalse("Failed to clean up partitions file.", fs.exists(partitionsFile));
}
BulkOutputFormat.java source code
Project: htools
/**
* Configure <code>job</code> with a TotalOrderPartitioner, partitioning
* against <code>splitPoints</code>. Cleans up the partitions file after the job
* exits.
*/
static void configurePartitioner(Job job, List<ImmutableBytesWritable> splitPoints)
throws IOException {
// create the partitions file
FileSystem fs = FileSystem.get(job.getConfiguration());
Path partitionsPath = new Path("/tmp", "partitions_" + UUID.randomUUID());
fs.makeQualified(partitionsPath);
fs.deleteOnExit(partitionsPath);
writePartitions(job.getConfiguration(), partitionsPath, splitPoints);
// configure job to use it
job.setPartitionerClass(TotalOrderPartitioner.class);
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionsPath);
}
HFileOutputFormat3.java source code
Project: kylin
/**
* Configure <code>job</code> with a TotalOrderPartitioner, partitioning against
* <code>splitPoints</code>. Cleans up the partitions file after the job exits.
*/
static void configurePartitioner(Job job, List<ImmutableBytesWritable> splitPoints)
throws IOException {
Configuration conf = job.getConfiguration();
// create the partitions file
FileSystem fs = FileSystem.get(conf);
Path partitionsPath = new Path(conf.get("hbase.fs.tmp.dir"), "partitions_" + UUID.randomUUID());
fs.makeQualified(partitionsPath);
writePartitions(conf, partitionsPath, splitPoints);
fs.deleteOnExit(partitionsPath);
// configure job to use it
job.setPartitionerClass(TotalOrderPartitioner.class);
TotalOrderPartitioner.setPartitionFile(conf, partitionsPath);
}
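Unlike the ditb and hbase variants earlier on this page, this version reads "hbase.fs.tmp.dir" without a default, so an unset key makes the Path constructor fail. A defensive lookup might look like this sketch; the fallback value mirrors, but is not copied from, HBase's HConstants defaults:

import java.util.UUID;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

class TmpDirSketch {
  static Path newPartitionsPath(Configuration conf) {
    // Fall back to a user-scoped staging directory when "hbase.fs.tmp.dir" is unset.
    String hbaseTmpFsDir = conf.get("hbase.fs.tmp.dir",
        "/user/" + System.getProperty("user.name") + "/hbase-staging");
    return new Path(hbaseTmpFsDir, "partitions_" + UUID.randomUUID());
  }
}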
IntegrationTestImportTsv.java source code
Project: hbase
/**
* Confirm the absence of the {@link TotalOrderPartitioner} partitions file.
*/
protected static void validateDeletedPartitionsFile(Configuration conf) throws IOException {
if (!conf.getBoolean(IntegrationTestingUtility.IS_DISTRIBUTED_CLUSTER, false))
return;
FileSystem fs = FileSystem.get(conf);
Path partitionsFile = new Path(TotalOrderPartitioner.getPartitionFile(conf));
assertFalse("Failed to clean up partitions file.", fs.exists(partitionsFile));
}
HFileOutputFormat2.java source code
Project: PyroDB
/**
* Configure <code>job</code> with a TotalOrderPartitioner, partitioning against
* <code>splitPoints</code>. Cleans up the partitions file after the job exits.
*/
static void configurePartitioner(Job job, List<ImmutableBytesWritable> splitPoints)
throws IOException {
// create the partitions file
FileSystem fs = FileSystem.get(job.getConfiguration());
Path partitionsPath = new Path("/tmp", "partitions_" + UUID.randomUUID());
fs.makeQualified(partitionsPath);
fs.deleteOnExit(partitionsPath);
writePartitions(job.getConfiguration(), partitionsPath, splitPoints);
// configure job to use it
job.setPartitionerClass(TotalOrderPartitioner.class);
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionsPath);
}
IntegrationTestImportTsv.java source code
Project: PyroDB
/**
* Confirm the absence of the {@link TotalOrderPartitioner} partitions file.
*/
protected static void validateDeletedPartitionsFile(Configuration conf) throws IOException {
if (!conf.getBoolean(IntegrationTestingUtility.IS_DISTRIBUTED_CLUSTER, false))
return;
FileSystem fs = FileSystem.get(conf);
Path partitionsFile = new Path(TotalOrderPartitioner.getPartitionFile(conf));
assertFalse("Failed to clean up partitions file.", fs.exists(partitionsFile));
}
HFileOutputFormat.java source code
Project: c5
/**
* Configure <code>job</code> with a TotalOrderPartitioner, partitioning against
* <code>splitPoints</code>. Cleans up the partitions file after the job exits.
*/
static void configurePartitioner(Job job, List<ImmutableBytesWritable> splitPoints)
throws IOException {
// create the partitions file
FileSystem fs = FileSystem.get(job.getConfiguration());
Path partitionsPath = new Path("/tmp", "partitions_" + UUID.randomUUID());
fs.makeQualified(partitionsPath);
fs.deleteOnExit(partitionsPath);
writePartitions(job.getConfiguration(), partitionsPath, splitPoints);
// configure job to use it
job.setPartitionerClass(TotalOrderPartitioner.class);
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionsPath);
}
TeraSort.java source code
Project: TeraSort-Local-Hadoop-MR-Spark
@Override
public int run(String[] newargs) throws Exception {
Job job = Job.getInstance(new Configuration());
job.setJarByClass(TeraSort.class);
// set mapper and reducer class
job.setMapperClass(TeraMapper.class);
job.setReducerClass(TeraReducer.class);
// set number of reducers
job.setNumReduceTasks(32);
job.setInputFormatClass(KeyValueTextInputFormat.class);
// set output of map class text as key
job.setMapOutputKeyClass(Text.class);
// set output of reducer as text class as key and value both are Text
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// set input path for the job
FileInputFormat.addInputPath(job, new Path(newargs[0]));
Path partitionFile = new Path(new Path(newargs[2]), "partitioning");
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
// use random sampler to write partitioner file
InputSampler.Sampler<Text, Text> sampler = new InputSampler.RandomSampler<>(0.01, 1000,32);
InputSampler.writePartitionFile(job, sampler);
// set partitioner to TotalOrderPartitioner
job.setPartitionerClass(TotalOrderPartitioner.class);
// set output directory for the job
FileOutputFormat.setOutputPath(job, new Path(newargs[1]));
int ret = job.waitForCompletion(true) ? 0 : 1;
logger.info("Done");
return ret;
}
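The run(String[]) method above suggests the class implements Hadoop's Tool interface. Assuming it does, a minimal launcher (illustrative, not part of the project) would pass the input, output and partition directories as arguments:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

class TeraSortLauncherSketch {
  public static void main(String[] args) throws Exception {
    // args[0] = input path, args[1] = output path, args[2] = directory for the "partitioning" file
    int exitCode = ToolRunner.run(new Configuration(), new TeraSort(), args);
    System.exit(exitCode);
  }
}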
HDTBuilderDriver.java source code
Project: hdt-mr
protected boolean runDictionaryJob() throws ClassNotFoundException, IOException, InterruptedException, URISyntaxException {
boolean jobOK;
Job job = null;
BufferedWriter bufferedWriter;
// if output path exists...
if (this.dictionaryFS.exists(this.conf.getDictionaryOutputPath())) {
if (this.conf.getDeleteDictionaryOutputPath()) { // ... and option provided, delete recursively
this.dictionaryFS.delete(this.conf.getDictionaryOutputPath(), true);
} else { // ... and option not provided, fail
System.out.println("Dictionary output path does exist: " + this.conf.getDictionaryOutputPath());
System.out.println("Select other path or use option -dd to overwrite");
System.exit(-1);
}
}
// Sample the SequenceInputFormat to do TotalSort and create final output
job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName() + " phase 2");
job.setJarByClass(HDTBuilderDriver.class);
System.out.println("samples = " + this.conf.getDictionarySamplesPath());
System.out.println("output = " + this.conf.getDictionaryOutputPath());
FileInputFormat.addInputPath(job, this.conf.getDictionarySamplesPath());
FileOutputFormat.setOutputPath(job, this.conf.getDictionaryOutputPath());
job.setInputFormatClass(SequenceFileInputFormat.class);
LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
// Identity Mapper
// job.setMapperClass(Mapper.class);
job.setCombinerClass(DictionaryCombiner.class);
job.setPartitionerClass(TotalOrderPartitioner.class);
job.setReducerClass(DictionaryReducer.class);
job.setNumReduceTasks(this.conf.getDictionaryReducers());
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
System.out.println("Sampling started");
InputSampler.writePartitionFile(job, new InputSampler.IntervalSampler<Text, Text>(this.conf.getSampleProbability()));
String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration());
URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
DistributedCache.createSymlink(job.getConfiguration());
System.out.println("Sampling finished");
MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SHARED, SequenceFileOutputFormat.class, Text.class, NullWritable.class);
MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SUBJECTS, SequenceFileOutputFormat.class, Text.class, NullWritable.class);
MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.PREDICATES, SequenceFileOutputFormat.class, Text.class, NullWritable.class);
MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.OBJECTS, SequenceFileOutputFormat.class, Text.class, NullWritable.class);
SequenceFileOutputFormat.setCompressOutput(job, true);
SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
jobOK = job.waitForCompletion(true);
this.numShared = job.getCounters().findCounter(Counters.Shared).getValue();
this.numSubjects = job.getCounters().findCounter(Counters.Subjects).getValue();
this.numPredicates = job.getCounters().findCounter(Counters.Predicates).getValue();
this.numObjects = job.getCounters().findCounter(Counters.Objects).getValue();
bufferedWriter = new BufferedWriter(new OutputStreamWriter(this.dictionaryFS.create(this.conf.getDictionaryCountersFile())));
bufferedWriter.write(HDTBuilderConfiguration.SHARED + "=" + this.numShared + "\n");
bufferedWriter.write(HDTBuilderConfiguration.SUBJECTS + "=" + this.numSubjects + "\n");
bufferedWriter.write(HDTBuilderConfiguration.PREDICATES + "=" + this.numPredicates + "\n");
bufferedWriter.write(HDTBuilderConfiguration.OBJECTS + "=" + this.numObjects + "\n");
bufferedWriter.close();
return jobOK;
}
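DistributedCache is deprecated in Hadoop 2. Assuming a Hadoop 2 runtime, the DistributedCache calls above have an equivalent on the Job API; symlinks for the '#' fragment are created automatically on YARN. A hedged sketch:

import java.net.URI;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

class CachePartitionFileSketch {
  static void cachePartitionFile(Job job) throws Exception {
    String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration());
    URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    // Replaces DistributedCache.addCacheFile(...) and DistributedCache.createSymlink(...)
    job.addCacheFile(partitionUri);
  }
}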
HDTBuilderDriver.java source code
Project: hdt-mr
protected boolean runTriplesJob() throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
Job job = null;
boolean jobOK;
// if triples output path exists...
if (this.triplesFS.exists(this.conf.getTriplesOutputPath())) {
if (this.conf.getDeleteTriplesOutputPath()) { // ... and option provided, delete recursively
this.triplesFS.delete(this.conf.getTriplesOutputPath(), true);
} else { // ... and option not provided, fail
System.out.println("Triples output path does exist: " + this.conf.getTriplesOutputPath());
System.out.println("Select other path or use option -dt to overwrite");
System.exit(-1);
}
}
job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName() + " phase 2");
job.setJarByClass(HDTBuilderDriver.class);
FileInputFormat.addInputPath(job, this.conf.getTriplesSamplesPath());
FileOutputFormat.setOutputPath(job, this.conf.getTriplesOutputPath());
job.setInputFormatClass(SequenceFileInputFormat.class);
LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
job.setSortComparatorClass(TripleSPOComparator.class);
job.setGroupingComparatorClass(TripleSPOComparator.class);
job.setPartitionerClass(TotalOrderPartitioner.class);
job.setOutputKeyClass(TripleSPOWritable.class);
job.setOutputValueClass(NullWritable.class);
job.setNumReduceTasks(this.conf.getTriplesReducers());
System.out.println("Sampling started");
InputSampler.writePartitionFile(job, new InputSampler.IntervalSampler<Text, Text>(this.conf.getSampleProbability()));
String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration());
URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
DistributedCache.createSymlink(job.getConfiguration());
System.out.println("Sampling finished");
SequenceFileOutputFormat.setCompressOutput(job, true);
SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
jobOK = job.waitForCompletion(true);
return jobOK;
}
SortDriver.java source code
Project: tree-index
public int run(String[] args) throws Exception {
SortConfig config = new SortConfig();
config.fromArray(args);
Job job = Job.getInstance(getConf());
job.setJobName("sort");
job.setJarByClass(SortDriver.class);
// define the path
Path inputPath = new Path(config.getInput());
Path partitionFilePath = new Path(config.getPartition());
Path outputPath = new Path(config.getOutput());
Path metaPath = new Path(config.getMeta());
LOGGER.info("use " + inputPath.toString() + " as sort input");
LOGGER.info("use " + partitionFilePath.toString() + " as partition");
LOGGER.info("use " + outputPath.toString() + " as sort output");
LOGGER.info("use " + metaPath.toString() + " as meta output");
// define the mapper
// use the identity mapper, which is the default implementation
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(Text.class);
job.setInputFormatClass(SequenceFileInputFormat.class);
SequenceFileInputFormat.setInputPaths(job, inputPath);
// define the reducer
job.getConfiguration().set(SortReducer.META_BASE_CONFIG_NAME, metaPath.toString());
job.setReducerClass(SortReducer.class);
job.setNumReduceTasks(NUM_REDUCER);
// use text for debug, use sequence is faster I guess
job.setOutputFormatClass(TextOutputFormat.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
TextOutputFormat.setOutputPath(job, outputPath);
// set partition
job.setPartitionerClass(TotalOrderPartitioner.class);
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFilePath);
// set the sampler
InputSampler.writePartitionFile(job, new InputSampler.RandomSampler(
1, 10000));
// set multiple output
MultipleOutputs.addNamedOutput(job, "meta", TextOutputFormat.class,
IntWritable.class, Text.class);
// clean up the old output path
outputPath.getFileSystem(job.getConfiguration()).delete(outputPath, true);
metaPath.getFileSystem(job.getConfiguration()).delete(metaPath, true);
// run the job and wait until it complete
return job.waitForCompletion(true) ? 0 : 1;
}
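The RandomSampler above is built with a frequency of 1, so every record is a sample candidate, which can be slow on large inputs. A hedged alternative configuration (illustrative values, not tree-index's choice) caps both the sample count and the number of splits visited:

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;

class SamplerSketch {
  static void writeSampledPartitions(Job job)
      throws IOException, ClassNotFoundException, InterruptedException {
    // 0.01: sample roughly 1% of records; 10000: stop after that many samples;
    // 10: read at most 10 input splits while sampling.
    InputSampler.writePartitionFile(job,
        new InputSampler.RandomSampler<IntWritable, Text>(0.01, 10000, 10));
  }
}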
TotalSortMapReduce.java source code
Project: hiped2
/**
* The MapReduce driver - setup and launch the job.
*
* @param args the command-line arguments
* @return the process exit code
* @throws Exception if something goes wrong
*/
public int run(final String[] args) throws Exception {
int numReducers = 2;
Cli cli = Cli.builder().setArgs(args).addOptions(CliOpts.values()).build();
int result = cli.runCmd();
if (result != 0) {
return result;
}
Path input = new Path(cli.getArgValueAsString(CliOpts.INPUT));
Path partitionFile = new Path(cli.getArgValueAsString(CliOpts.PARTITION));
Path output = new Path(cli.getArgValueAsString(CliOpts.OUTPUT));
InputSampler.Sampler<Text, Text> sampler =
new InputSampler.RandomSampler<Text, Text>
(0.1,
10000,
10);
Configuration conf = super.getConf();
Job job = new Job(conf);
job.setJarByClass(TotalSortMapReduce.class);
job.setNumReduceTasks(numReducers);
job.setInputFormatClass(KeyValueTextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setPartitionerClass(TotalOrderPartitioner.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
FileInputFormat.setInputPaths(job, input);
FileOutputFormat.setOutputPath(job, output);
InputSampler.writePartitionFile(job, sampler);
URI partitionUri = new URI(partitionFile.toString() +
"#" + "_sortPartitioning");
DistributedCache.addCacheFile(partitionUri, conf);
if (job.waitForCompletion(true)) {
return 0;
}
return 1;
}
HadoopFileRanker.java source code
Project: ldbc_snb_datagen
/**
* Sorts a Hadoop sequence file
*
* @param inputFileName The name of the file to sort.
* @param outputFileName The name of the sorted file.
* @throws Exception
*/
public void run(String inputFileName, String outputFileName) throws Exception {
int numThreads = conf.getInt("ldbc.snb.datagen.generator.numThreads", 1);
if (keySetterName != null) {
conf.set("keySetterClassName", keySetterName);
}
/** First Job to sort the key-value pairs and to count the number of elements processed by each reducer.**/
Job jobSort = Job.getInstance(conf, "Sorting " + inputFileName);
FileInputFormat.setInputPaths(jobSort, new Path(inputFileName));
FileOutputFormat
.setOutputPath(jobSort, new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/rankIntermediate"));
if (keySetterName != null) {
jobSort.setMapperClass(HadoopFileRankerSortMapper.class);
}
jobSort.setMapOutputKeyClass(K);
jobSort.setMapOutputValueClass(V);
jobSort.setOutputKeyClass(BlockKey.class);
jobSort.setOutputValueClass(V);
jobSort.setNumReduceTasks(numThreads);
jobSort.setReducerClass(HadoopFileRankerSortReducer.class);
jobSort.setJarByClass(V);
jobSort.setInputFormatClass(SequenceFileInputFormat.class);
jobSort.setOutputFormatClass(SequenceFileOutputFormat.class);
InputSampler.Sampler sampler = new InputSampler.RandomSampler(0.1, 1000);
TotalOrderPartitioner.setPartitionFile(jobSort.getConfiguration(), new Path(inputFileName + "_partition.lst"));
InputSampler.writePartitionFile(jobSort, sampler);
jobSort.setPartitionerClass(TotalOrderPartitioner.class);
if (!jobSort.waitForCompletion(true)) {
throw new Exception();
}
/** Second Job to assign the rank to each element.**/
Job jobRank = Job.getInstance(conf, "Sorting " + inputFileName);
FileInputFormat
.setInputPaths(jobRank, new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/rankIntermediate"));
FileOutputFormat.setOutputPath(jobRank, new Path(outputFileName));
jobRank.setMapOutputKeyClass(BlockKey.class);
jobRank.setMapOutputValueClass(V);
jobRank.setOutputKeyClass(LongWritable.class);
jobRank.setOutputValueClass(V);
jobRank.setSortComparatorClass(BlockKeyComparator.class);
jobRank.setNumReduceTasks(numThreads);
jobRank.setReducerClass(HadoopFileRankerFinalReducer.class);
jobRank.setJarByClass(V);
jobRank.setInputFormatClass(SequenceFileInputFormat.class);
jobRank.setOutputFormatClass(SequenceFileOutputFormat.class);
jobRank.setPartitionerClass(HadoopFileRankerPartitioner.class);
if (!jobRank.waitForCompletion(true)) {
throw new Exception();
}
try {
FileSystem fs = FileSystem.get(conf);
for (int i = 0; i < numThreads; ++i) {
fs.delete(new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/rank_" + i), true);
}
fs.delete(new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/rankIntermediate"), true);
} catch (IOException e) {
System.err.println(e.getMessage());
}
}
BulkImportJobExample.java source code
Project: hbase-in-action
/**
* Fixes a potential overlap of generated regions/splits for a dataset with many identical keys. For instance,
* let the samples be {1,1,1,1,3,3,3,5,6} and the number of partitions be 3: the original implementation produces
* the splits 1-1, 3-3, 3-6; note the overlap between the 2nd and 3rd partition.
*
* @param job
* @param sampler
* @param <K>
* @param <V>
* @throws IOException
* @throws ClassNotFoundException
* @throws InterruptedException
*/
@SuppressWarnings("unchecked")
public static <K, V> void writePartitionFile(Job job, InputSampler.Sampler<K, V> sampler)
throws IOException, ClassNotFoundException, InterruptedException {
LinkedList<K> splits = new LinkedList<K>();
Configuration conf = job.getConfiguration();
final InputFormat inf =
ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
int numPartitions = job.getNumReduceTasks();
K[] samples = sampler.getSample(inf, job); // draw the samples used to derive the split points
LOG.info("Using " + samples.length + " samples");
RawComparator<K> comparator = (RawComparator<K>) job.getGroupingComparator();
Arrays.sort(samples, comparator);
Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf));
FileSystem fs = dst.getFileSystem(conf);
if (fs.exists(dst)) fs.delete(dst, false);
SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, dst, job.getMapOutputKeyClass(), NullWritable.class);
NullWritable nullValue = NullWritable.get();
float stepSize = samples.length / (float) numPartitions;
K lastKey = null;
K currentKey = null;
int lastKeyIndex = -1;
for (int i = 1; i < numPartitions; ++i) {
int currentKeyOffset = Math.round(stepSize * i);
if (lastKeyIndex > currentKeyOffset) {
long keyOffset = lastKeyIndex - currentKeyOffset;
float errorRate = (float) keyOffset / samples.length;
LOG.warn(
String.format("Partitions overlap. Consider using a different Sampler " +
"and/or increase the number of samples and/or use more splits to take samples from. " +
"Next sample would have been %s key overlaps by a distance of %d (factor %f) ", samples[currentKeyOffset], keyOffset, errorRate));
currentKeyOffset = lastKeyIndex + 1;
}
currentKey = samples[currentKeyOffset];
while (lastKey != null && comparator.compare(currentKey, lastKey) == 0) {
currentKeyOffset++;
if (currentKeyOffset >= samples.length) {
LOG.info("Last 10 elements:");
for (int d = samples.length - 1; d > samples.length - 11; d--) {
LOG.debug(samples[d]);
}
throw new IOException("Not enough samples, stopped at partition " + i);
}
currentKey = samples[currentKeyOffset];
}
writer.append(currentKey, nullValue);
lastKey = currentKey;
lastKeyIndex = currentKeyOffset;
splits.add(currentKey);
}
writer.close();
LOG.info("********************************************* ");
LOG.info(" START KEYs for new Regions: ");
for (K split : splits) {
LOG.info("* " + split.toString());
}
}
BulkImportJobExample.java source code
Project: hbase-in-action
public static Job createSubmittableJob(Configuration conf, String[] args)
throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
Path inputDir = new Path(args[0]);
Path outputDir = new Path(args[1]);
boolean createPartitionFile = Boolean.parseBoolean(args[2]);
Job job = Job.getInstance(conf,
"Import delicious RSS feed into Hush tables.");
job.setJarByClass(BulkImportJobExample.class);
job.setInputFormatClass(TextInputFormat.class);
// conf.setLong("hbase.hregion.max.filesize", 64 * 1024);
FileInputFormat.setInputPaths(job, inputDir);
job.setMapperClass(BulkImportMapper.class);
job.setMapOutputKeyClass(ImmutableBytesWritable.class);
job.setMapOutputValueClass(Put.class);
job.setPartitionerClass(TotalOrderPartitioner.class);
job.setReducerClass(PutSortReducer.class);
job.setOutputKeyClass(ImmutableBytesWritable.class);
job.setOutputValueClass(KeyValue.class);
job.setOutputFormatClass(HFileOutputFormat.class);
HFileOutputFormat.setOutputPath(job, outputDir);
HFileOutputFormat.setCompressOutput(job, true);
HFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
job.getConfiguration().set("hfile.compression", "gz");
//job.getConfiguration().setFloat("mapred.job.shuffle.input.buffer.percent", 0.5f);
//job.setNumReduceTasks(30);
Path partitionsPath = new Path(job.getWorkingDirectory(),
"partitions_" + System.currentTimeMillis());
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionsPath);
if (createPartitionFile) {
VerboseInputSampler.Sampler<KeyValue, ImmutableBytesWritable> sampler =
new VerboseInputSampler.VerboseRandomSampler<KeyValue, ImmutableBytesWritable>(0.05, 1000000, 30); // use 0.1 for real sampling
LOG.info("Sampling key space");
VerboseInputSampler.writePartitionFile(job, sampler);
LOG.info("Samping done");
}
URI cacheUri = new URI(partitionsPath.toString() + "#" +
TotalOrderPartitioner.DEFAULT_PATH);
DistributedCache.addCacheFile(cacheUri, job.getConfiguration());
DistributedCache.createSymlink(job.getConfiguration());
return job;
}
HadoopFileRanker.java source code
Project: ldbc_snb_datagen_deprecated2015
/** Sorts a Hadoop sequence file
*
* @param inputFileName The name of the file to sort.
* @param outputFileName The name of the sorted file.
* @throws Exception
*/
public void run( String inputFileName, String outputFileName ) throws Exception {
int numThreads = conf.getInt("numThreads",1);
/** First Job to sort the key-value pairs and to count the number of elements processed by each reducer.**/
Job jobSort = new Job(conf, "Sorting "+inputFileName);
FileInputFormat.setInputPaths(jobSort, new Path(inputFileName));
FileOutputFormat.setOutputPath(jobSort, new Path(conf.get("outputDir")+"/hadoop/rankIntermediate"));
jobSort.setMapOutputKeyClass(K);
jobSort.setMapOutputValueClass(V);
jobSort.setOutputKeyClass(ComposedKey.class);
jobSort.setOutputValueClass(V);
jobSort.setNumReduceTasks(numThreads);
jobSort.setReducerClass(HadoopFileRankerSortReducer.class);
jobSort.setJarByClass(V);
jobSort.setInputFormatClass(SequenceFileInputFormat.class);
jobSort.setOutputFormatClass(SequenceFileOutputFormat.class);
InputSampler.Sampler sampler = new InputSampler.RandomSampler(0.1, 1000);
TotalOrderPartitioner.setPartitionFile(jobSort.getConfiguration(), new Path(inputFileName + "_partition.lst"));
InputSampler.writePartitionFile(jobSort, sampler);
jobSort.setPartitionerClass(TotalOrderPartitioner.class);
jobSort.waitForCompletion(true);
/** Second Job to assign the rank to each element.**/
Job jobRank = new Job(conf, "Sorting "+inputFileName);
FileInputFormat.setInputPaths(jobRank, new Path(conf.get("outputDir")+"/hadoop/rankIntermediate"));
FileOutputFormat.setOutputPath(jobRank, new Path(outputFileName));
jobRank.setMapOutputKeyClass(ComposedKey.class);
jobRank.setMapOutputValueClass(V);
jobRank.setOutputKeyClass(LongWritable.class);
jobRank.setOutputValueClass(V);
jobRank.setSortComparatorClass(ComposedKeyComparator.class);
jobRank.setNumReduceTasks(numThreads);
jobRank.setReducerClass(HadoopFileRankerFinalReducer.class);
jobRank.setJarByClass(V);
jobRank.setInputFormatClass(SequenceFileInputFormat.class);
jobRank.setOutputFormatClass(SequenceFileOutputFormat.class);
jobRank.setPartitionerClass(HadoopFileRankerPartitioner.class);
jobRank.waitForCompletion(true);
try{
FileSystem fs = FileSystem.get(conf);
for(int i = 0; i < numThreads;++i ) {
fs.delete(new Path(conf.get("outputDir")+"/hadoop/rank_"+i),true);
}
fs.delete(new Path(conf.get("outputDir")+"/hadoop/rankIntermediate"),true);
} catch(IOException e) {
System.err.println(e.getMessage());
}
}
TotalOrderSortingStage.java source code
Project: hadoop-map-reduce-patterns
@SuppressWarnings("unchecked")
@Override
public int run(String[] args) throws Exception {
Configuration conf = new Configuration();
Path inputPath = new Path(args[0]);
Path partitionFile = new Path(args[1] + "_partitions.lst");
Path outputStage = new Path(args[1] + "_staging");
Path outputOrder = new Path(args[1]);
// Configure job to prepare for sampling
Job sampleJob = new Job(conf, "TotalOrderSortingStage");
sampleJob.setJarByClass(TotalOrderSortingStage.class);
// Use the mapper implementation with zero reduce tasks
sampleJob.setMapperClass(LastAccessMapper.class);
sampleJob.setNumReduceTasks(0);
sampleJob.setOutputKeyClass(Text.class);
sampleJob.setOutputValueClass(Text.class);
TextInputFormat.setInputPaths(sampleJob, inputPath);
// Set the output format to a sequence file
sampleJob.setOutputFormatClass(SequenceFileOutputFormat.class);
SequenceFileOutputFormat.setOutputPath(sampleJob, outputStage);
// Submit the job and get completion code.
int code = sampleJob.waitForCompletion(true) ? 0 : 1;
if (code == 0) {
Job orderJob = new Job(conf, "TotalOrderSortingStage");
orderJob.setJarByClass(TotalOrderSortingStage.class);
// Here, use the identity mapper to output the key/value pairs in
// the SequenceFile
orderJob.setMapperClass(Mapper.class);
orderJob.setReducerClass(ValuesReducer.class);
// Set the number of reduce tasks to an appropriate number for the
// amount of data being sorted
orderJob.setNumReduceTasks(10);
// Use Hadoop's TotalOrderPartitioner class
orderJob.setPartitionerClass(TotalOrderPartitioner.class);
// Set the partition file
TotalOrderPartitioner.setPartitionFile(orderJob.getConfiguration(),
partitionFile);
orderJob.setOutputKeyClass(Text.class);
orderJob.setOutputValueClass(Text.class);
// Set the input to the previous job's output
orderJob.setInputFormatClass(SequenceFileInputFormat.class);
SequenceFileInputFormat.setInputPaths(orderJob, outputStage);
// Set the output path to the command line parameter
TextOutputFormat.setOutputPath(orderJob, outputOrder);
// Set the separator to an empty string
orderJob.getConfiguration().set(
"mapred.textoutputformat.separator", "");
// Use the InputSampler to go through the output of the previous
// job, sample it, and create the partition file
InputSampler.writePartitionFile(orderJob,
new InputSampler.RandomSampler(.001, 10000));
// Submit the job
code = orderJob.waitForCompletion(true) ? 0 : 2;
}
// Clean up the partition file and the staging directory
FileSystem.get(new Configuration()).delete(partitionFile, false);
FileSystem.get(new Configuration()).delete(outputStage, true);
return code;
}
HFileOutputFormat.java source code
Project: DominoHBase
/**
* Configure a MapReduce Job to perform an incremental load into the given
* table. This
* <ul>
* <li>Inspects the table to configure a total order partitioner</li>
* <li>Uploads the partitions file to the cluster and adds it to the DistributedCache</li>
* <li>Sets the number of reduce tasks to match the current number of regions</li>
* <li>Sets the output key/value class to match HFileOutputFormat's requirements</li>
* <li>Sets the reducer up to perform the appropriate sorting (either KeyValueSortReducer or
* PutSortReducer)</li>
* </ul>
* The user should be sure to set the map output value class to either KeyValue or Put before
* running this function.
*/
public static void configureIncrementalLoad(Job job, HTable table)
throws IOException {
Configuration conf = job.getConfiguration();
job.setPartitionerClass(TotalOrderPartitioner.class);
job.setOutputKeyClass(ImmutableBytesWritable.class);
job.setOutputValueClass(KeyValue.class);
job.setOutputFormatClass(HFileOutputFormat.class);
// Based on the configured map output class, set the correct reducer to properly
// sort the incoming values.
// TODO it would be nice to pick one or the other of these formats.
if (KeyValue.class.equals(job.getMapOutputValueClass())) {
job.setReducerClass(KeyValueSortReducer.class);
} else if (Put.class.equals(job.getMapOutputValueClass())) {
job.setReducerClass(PutSortReducer.class);
} else {
LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
}
LOG.info("Looking up current regions for table " + table);
List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
LOG.info("Configuring " + startKeys.size() + " reduce partitions " +
"to match current region count");
job.setNumReduceTasks(startKeys.size());
Path partitionsPath = new Path(job.getWorkingDirectory(),
"partitions_" + UUID.randomUUID());
LOG.info("Writing partition information to " + partitionsPath);
FileSystem fs = partitionsPath.getFileSystem(conf);
writePartitions(conf, partitionsPath, startKeys);
partitionsPath.makeQualified(fs);
URI cacheUri;
try {
cacheUri = new URI(partitionsPath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
} catch (URISyntaxException e) {
throw new IOException(e);
}
DistributedCache.addCacheFile(cacheUri, conf);
DistributedCache.createSymlink(conf);
// Set compression algorithms based on column families
configureCompression(table, conf);
configureBloomType(table, conf);
TableMapReduceUtil.addDependencyJars(job);
LOG.info("Incremental table output configured.");
}
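As the Javadoc notes, the caller must set the map output value class to KeyValue or Put before invoking configureIncrementalLoad. The following is a hedged end-to-end driver sketch for the Put path; the class, family and qualifier names are made up for illustration, and the HTable constructor is the old API used throughout this page:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

class BulkLoadDriverSketch {

  // Minimal stand-in mapper: treats each input line as a row key and writes a
  // single dummy cell; real code would parse the line into columns.
  static class LineToPutMapper
      extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
    @Override
    protected void map(LongWritable offset, Text line, Context context)
        throws IOException, InterruptedException {
      byte[] row = Bytes.toBytes(line.toString());
      Put put = new Put(row);
      put.add(Bytes.toBytes("f"), Bytes.toBytes("q"), row); // assumed family "f", qualifier "q"
      context.write(new ImmutableBytesWritable(row), put);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    Job job = Job.getInstance(conf, "bulk-load-sketch");
    job.setJarByClass(BulkLoadDriverSketch.class);
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));    // args[0]: input text files
    job.setMapperClass(LineToPutMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(Put.class);                    // must be set before the call below
    HTable table = new HTable(conf, args[1]);                 // args[1]: target table name
    // Wires the TotalOrderPartitioner, the partitions file, PutSortReducer and the reducer count.
    HFileOutputFormat.configureIncrementalLoad(job, table);
    HFileOutputFormat.setOutputPath(job, new Path(args[2]));  // args[2]: HFile output directory
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}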