LeaveOneOutValidator.java 文件源码

java
阅读 19 收藏 0 点赞 0 评论 0

项目:codebuff 作者:
/**
     * Holds one document out of the corpus, trains on the remaining documents,
     * formats the held-out document and reports how the result compares to it.
     *
     * <p>Side effects visible in this method: progress lines are printed to
     * {@code System.out}, the training time (ms) is appended to
     * {@code trainingTimes}, the formatting throughput (tokens/ms) is appended
     * to {@code formattingTokensPerMS}, and the formatted text is optionally
     * written to disk.
     *
     * @param language            supplies {@code indentSize} for the formatter and
     *                            {@code name} for the output subdirectory
     * @param documents           the full corpus, including the document to hold out
     * @param fileToExclude       path of the document to hold out; it is resolved to an
     *                            absolute path and compared with each document's
     *                            {@code fileName}
     * @param k                   forwarded to the {@code Formatter} constructor
     * @param injectWSFeatures    forwarded to the {@code Formatter} constructor
     * @param alignmentFeatures   forwarded to the {@code Formatter} constructor
     * @param outputDir           when non-null, the formatted text is written to
     *                            {@code outputDir/<language.name>/<Tool.version>/<file name>}
     * @param computeEditDistance when false the edit distance is not computed and is
     *                            reported as 0
     * @param collectAnalysis     forwarded to {@code Formatter.format}
     * @return (formatter, edit distance, error rate), or {@code null} when no document
     *         in {@code documents} matches {@code fileToExclude}
     * @throws Exception declared by the original signature; presumably raised by
     *                   training, formatting or file writing (not visible from here)
     */
    public Triple<Formatter,Float,Float> validate(LangDescriptor language,
                                                  List<InputDocument> documents,
                                                  String fileToExclude,
                                                  int k,
                                                  FeatureMetaData[] injectWSFeatures,
                                                  FeatureMetaData[] alignmentFeatures,
                                                  String outputDir,
                                                  boolean computeEditDistance,
                                                  boolean collectAnalysis)
        throws Exception
    {
        final String path = new File(fileToExclude).getAbsolutePath();
        List<InputDocument> others = filter(documents, d -> !d.fileName.equals(path));
        List<InputDocument> excluded = filter(documents, d -> d.fileName.equals(path));
//      kNNClassifier.resetCache();
        if ( excluded.isEmpty() ) {
            System.err.println("Doc not in corpus: "+path);
            return null;
        }
        // Checked only after the guard above: with assertions enabled, a missing
        // document would otherwise trip this before the diagnostic is printed.
        assert others.size() == documents.size() - 1;

        InputDocument testDoc = excluded.get(0);

        long start = System.nanoTime();
        Corpus corpus = new Corpus(others, language);
        corpus.train();
        long stop = System.nanoTime();

        Formatter formatter = new Formatter(corpus, language.indentSize, k, injectWSFeatures, alignmentFeatures);
        long format_start = System.nanoTime();
        String output = formatter.format(testDoc, collectAnalysis);
        long format_stop = System.nanoTime();

        float editDistance = 0;
        if ( computeEditDistance ) {
            editDistance = normalizedLevenshteinDistance(testDoc.content, output);
        }
        ClassificationAnalysis analysis = new ClassificationAnalysis(testDoc, formatter.getAnalysisPerToken());
        float errorRate = analysis.getErrorRate();
        System.out.println(testDoc.fileName+": edit distance = "+editDistance+", error rate = "+errorRate);

        if ( outputDir!=null ) {
            File dir = new File(outputDir+"/"+language.name+"/"+Tool.version);
            // mkdirs() also returns false when the directory already exists, so
            // only warn when it is really absent afterwards.
            if ( !dir.exists() && !dir.mkdirs() && !dir.isDirectory() ) {
                System.err.println("Cannot create output dir: "+dir.getPath());
            }
            Utils.writeFile(dir.getPath()+"/"+new File(testDoc.fileName).getName(), output);
        }

        long tms = (stop - start) / 1_000_000;
        long formatNanos = format_stop - format_start;
        long fms = formatNanos / 1_000_000;
        trainingTimes.add((double)tms);
        // Throughput is derived from the nanosecond reading: dividing by the
        // truncated millisecond count gives Infinity/NaN for sub-millisecond runs,
        // which would then be recorded in formattingTokensPerMS.
        int nTokens = testDoc.tokens.size();
        float tokensPerMS = formatNanos > 0 ? nTokens * 1_000_000f / formatNanos : 0f;
        formattingTokensPerMS.add((double)tokensPerMS);
        System.out.printf("Training time = %d ms, formatting %d ms, %5.3f tokens/ms (%d tokens)\n",
                          tms,
                          fms,
                          tokensPerMS, nTokens);
//      System.out.printf("classify calls %d, hits %d rate %f\n",
//                        kNNClassifier.nClassifyCalls, kNNClassifier.nClassifyCacheHits,
//                        kNNClassifier.nClassifyCacheHits/(float) kNNClassifier.nClassifyCalls);
//      System.out.printf("kNN calls %d, hits %d rate %f\n",
//                        kNNClassifier.nNNCalls, kNNClassifier.nNNCacheHits,
//                        kNNClassifier.nNNCacheHits/(float) kNNClassifier.nNNCalls);
        return new Triple<>(formatter, editDistance, errorRate);
    }
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号