/**
* Runs the link analysis job. The link analysis job applies the link rank
* formula to create a score per url and stores that score in the NodeDb.
*
* Typically the link analysis job is run a number of times to allow the link
* rank scores to converge.
*
* @param nodeDb
* The node database from which we are getting previous link rank
* scores.
* @param inverted
* The inverted inlinks
* @param output
* The link analysis output.
* @param iteration
* The current iteration number.
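* @param numIterations
* The total number of link analysis iterations.
* @param rankOne
* The rank-one value handed to the job through the
* link.analyze.rank.one configuration property.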
*
* @throws IOException
* If an error occurs during link analysis.
*/
private void runAnalysis(Path nodeDb, Path inverted, Path output,
    int iteration, int numIterations, float rankOne) throws IOException {

  // The iteration is published one higher than the value passed in, both in
  // the job configuration and in the human-readable job name.
  final int shownIteration = iteration + 1;

  final JobConf job = new NutchJob(getConf());
  job.setJobName("LinkAnalysis Analyzer, iteration " + shownIteration
      + " of " + numIterations);

  // Parameters read back by the job through its configuration.
  job.set("link.analyze.iteration", String.valueOf(shownIteration));
  job.set("link.analyze.rank.one", String.valueOf(rankOne));

  // Inputs: the node database first, then the inverted links. Both are read
  // as sequence files.
  FileInputFormat.addInputPath(job, nodeDb);
  FileInputFormat.addInputPath(job, inverted);
  job.setInputFormat(SequenceFileInputFormat.class);

  // Analyzer serves as both the map and the reduce implementation.
  job.setMapperClass(Analyzer.class);
  job.setReducerClass(Analyzer.class);

  // Intermediate records: Text keys with ObjectWritable values.
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(ObjectWritable.class);

  // Final records: Text keys with Node values, written as map files.
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Node.class);
  job.setOutputFormat(MapFileOutputFormat.class);
  FileOutputFormat.setOutputPath(job, output);

  // Switch off the committer's "mark successful jobs" flag for this job.
  job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  LOG.info("Starting analysis job");
  try {
    JobClient.runJob(job);
  } catch (IOException failure) {
    // Record the full stack trace, then let the caller handle the failure.
    LOG.error(StringUtils.stringifyException(failure));
    throw failure;
  }
  LOG.info("Finished analysis job.");
}
LinkRank.java 文件源码
java
阅读 27
收藏 0
点赞 0
评论 0
项目:GeoCrawler
作者:
评论列表
文章目录