/**
 * Computes, for each corpus subset size, the median error rate observed over
 * a number of trials.
 *
 * <p>Documents are loaded once via {@code Tool.load(validator.allFiles, language)}.
 * For every subset size {@code n} from 1 up to the number of available files
 * (capped by {@code maxNumFiles}), one job runs {@code trials} iterations; each
 * iteration draws a sample with {@code validator.selectSample(documents, n)},
 * passes it to {@code validate(...)} and records the third component of the
 * returned triple as that trial's error rate. The sorted rates' element at
 * index {@code size/2} is stored as the median for {@code n}.
 *
 * <p>Jobs are run on a fixed thread pool. Each job writes only its own slot of
 * the result array. A job that throws has its stack trace printed to
 * {@code System.err} and leaves its slot at {@code 0.0f}.
 *
 * @param language    language descriptor; its {@code corpusDir} and {@code name} are read here
 * @param maxNumFiles upper bound on the largest subset size evaluated
 * @param trials      number of trials per subset size
 * @return array of length {@code min(documents.size(), maxNumFiles) + 1} where
 *         index {@code n} holds the median for subset size {@code n}; index 0 is unused
 * @throws Exception if loading fails or the calling thread is interrupted while
 *                   waiting for the jobs
 */
public static float[] getMedianErrorRates(LangDescriptor language, int maxNumFiles, int trials) throws Exception {
    SubsetValidator validator = new SubsetValidator(language.corpusDir, language);
    List<InputDocument> documents = Tool.load(validator.allFiles, language);
    float[] medians = new float[Math.min(documents.size(),maxNumFiles)+1];
    // Bound the loop by the array that receives the results so a mismatch between
    // the number of files and the number of loaded documents can never index past it.
    int largestSubsetSize = Math.min(validator.allFiles.size(), medians.length-1);

    int ncpu = Runtime.getRuntime().availableProcessors();
    if ( FORCE_SINGLE_THREADED ) {
        ncpu = 2; // yields exactly one worker below
    }
    // newFixedThreadPool rejects a size of 0, which ncpu-1 produces on a one-processor machine.
    ExecutorService pool = Executors.newFixedThreadPool(Math.max(1, ncpu-1));
    try {
        List<Callable<Void>> jobs = new ArrayList<>();
        for (int i = 1; i<=largestSubsetSize; i++) { // i is corpus subset size
            final int corpusSubsetSize = i;
            Callable<Void> job = () -> {
                try {
                    List<Float> errorRates = new ArrayList<>();
                    for (int trial = 1; trial<=trials; trial++) { // multiple trials per subset size
                        Pair<InputDocument, List<InputDocument>> sample = validator.selectSample(documents, corpusSubsetSize);
                        // NOTE(review): sample.a looks like the held-out document and sample.b the
                        // training corpus (see the debug prints below) -- confirm against selectSample.
                        Triple<Formatter, Float, Float> results = validate(language, sample.b, sample.a, true, false);
//                      System.out.println(sample.a.fileName+" n="+corpusSubsetSize+": error="+results.c);
//                      System.out.println("\tcorpus =\n\t\t"+Utils.join(sample.b.iterator(), "\n\t\t"));
                        errorRates.add(results.c);
                    }
                    Collections.sort(errorRates);
                    int n = errorRates.size();
                    float median = errorRates.get(n/2); // upper-middle element when n is even
                    System.out.println("median "+language.name+" error rate for n="+corpusSubsetSize+" is "+median);
                    medians[corpusSubsetSize] = median;
                }
                catch (Throwable t) {
                    // Best effort: report the failure and let the other subset sizes finish.
                    t.printStackTrace(System.err);
                }
                return null;
            };
            jobs.add(job);
        }
        pool.invokeAll(jobs); // blocks until every job has completed
    }
    finally {
        // Always release the worker threads, even if invokeAll is interrupted.
        pool.shutdown();
    }
    pool.awaitTermination(60, TimeUnit.MINUTES);
    return medians;
}
SubsetValidator.java 文件源码
java
阅读 21
收藏 0
点赞 0
评论 0
项目:codebuff
作者:
评论列表
文章目录