/**
 * Runs one leave-one-out validation round: trains a corpus on every document
 * except {@code fileToExclude}, formats the excluded document with the
 * resulting {@link Formatter}, and reports how the formatted output compares.
 *
 * <p>Side effects: appends the training time (ms) to {@code trainingTimes} and
 * the formatting throughput (tokens/ms) to {@code formattingTokensPerMS},
 * prints a summary to {@code System.out}, and, when {@code outputDir} is
 * non-null, writes the formatted text to
 * {@code outputDir/<language.name>/<Tool.version>/<file name>}.
 *
 * @param language            language descriptor; supplies the corpus language,
 *                            the indent size and the output sub-directory name
 * @param documents           all documents of the corpus, including the one to exclude
 * @param fileToExclude       path of the document to hold out; it is converted to an
 *                            absolute path and compared against each document's {@code fileName}
 * @param k                   passed through to the {@link Formatter} constructor
 * @param injectWSFeatures    passed through to the {@link Formatter} constructor
 * @param alignmentFeatures   passed through to the {@link Formatter} constructor
 * @param outputDir           root directory for formatted output, or {@code null} to skip writing
 * @param computeEditDistance when {@code true}, the normalized Levenshtein distance between the
 *                            original content and the formatted output is computed;
 *                            otherwise the reported distance is 0
 * @param collectAnalysis     passed through to {@code Formatter.format}
 * @return a triple of (formatter, edit distance, error rate), or {@code null} when no
 *         document in {@code documents} matches {@code fileToExclude}
 * @throws Exception propagated from training, formatting, or writing the output file
 */
public Triple<Formatter,Float,Float> validate(LangDescriptor language,
                                              List<InputDocument> documents,
                                              String fileToExclude,
                                              int k,
                                              FeatureMetaData[] injectWSFeatures,
                                              FeatureMetaData[] alignmentFeatures,
                                              String outputDir,
                                              boolean computeEditDistance,
                                              boolean collectAnalysis)
    throws Exception
{
    final String path = new File(fileToExclude).getAbsolutePath();
    List<InputDocument> others = filter(documents, d -> !d.fileName.equals(path));
    List<InputDocument> excluded = filter(documents, d -> d.fileName.equals(path));

    // Check for the missing document BEFORE asserting on sizes; otherwise, with
    // assertions enabled, a missing document trips the assert and this
    // diagnostic / null return is never reached.
    if ( excluded.isEmpty() ) {
        System.err.println("Doc not in corpus: "+path);
        return null;
    }
    // Exactly one document is expected to match the excluded path.
    assert others.size() == documents.size() - 1;

    InputDocument testDoc = excluded.get(0);

    // Train on everything except the held-out document.
    long trainStart = System.nanoTime();
    Corpus corpus = new Corpus(others, language);
    corpus.train();
    long trainStop = System.nanoTime();

    Formatter formatter = new Formatter(corpus, language.indentSize, k, injectWSFeatures, alignmentFeatures);

    long formatStart = System.nanoTime();
    String output = formatter.format(testDoc, collectAnalysis);
    long formatStop = System.nanoTime();

    float editDistance = 0;
    if ( computeEditDistance ) {
        editDistance = normalizedLevenshteinDistance(testDoc.content, output);
    }

    ClassificationAnalysis analysis = new ClassificationAnalysis(testDoc, formatter.getAnalysisPerToken());
    float errorRate = analysis.getErrorRate();
    System.out.println(testDoc.fileName+": edit distance = "+editDistance+", error rate = "+errorRate);

    if ( outputDir!=null ) {
        File dir = new File(outputDir+"/"+language.name+"/"+Tool.version);
        if ( !dir.exists() ) {
            dir.mkdirs();
        }
        Utils.writeFile(dir.getPath()+"/"+new File(testDoc.fileName).getName(), output);
    }

    long tms = (trainStop - trainStart) / 1_000_000;
    long fms = (formatStop - formatStart) / 1_000_000;
    trainingTimes.add((double)tms);

    // Derive throughput from the nanosecond interval rather than the truncated
    // millisecond count: a sub-millisecond format would otherwise divide by
    // zero and record Infinity (or NaN for an empty document) in the statistics.
    int numTokens = testDoc.tokens.size();
    long formatNanos = Math.max(1L, formatStop - formatStart);
    float tokensPerMS = (float)(numTokens * 1_000_000.0 / formatNanos);
    formattingTokensPerMS.add((double)tokensPerMS);

    System.out.printf("Training time = %d ms, formatting %d ms, %5.3f tokens/ms (%d tokens)\n",
                      tms,
                      fms,
                      tokensPerMS, numTokens);

    return new Triple<>(formatter, editDistance, errorRate);
}
LeaveOneOutValidator.java 文件源码
java
阅读 19
收藏 0
点赞 0
评论 0
项目:codebuff
作者:
评论列表
文章目录