/** Compute a document difference metric 0..1.0 between two documents that
 * are identical other than (likely) the whitespace and comments.
 *
 * 1.0 means the docs are maximally different and 0 means docs are identical.
 *
 * The Levenshtein distance between the docs counts only
 * whitespace diffs as the non-WS content is identical.
 * Levenshtein distance is bounded by 0..max(len(doc1),len(doc2)) so
 * we normalize the distance by dividing by max WS count.
 *
 * @param original   original document text
 * @param formatted  reformatted document text (same real tokens, different WS)
 * @param lexerClass lexer used to tokenize both documents; it must route
 *                   whitespace/comments to a hidden channel
 * @return normalized whitespace edit distance in [0, 1]; 0 if neither
 *         document contains any whitespace (they are then identical)
 * @throws Exception if tokenization fails
 *
 * TODO: can we simplify this to a simple walk with two
 * cursors through the original vs formatted counting
 * mismatched whitespace? real text are like anchors.
 */
public static double docDiff(String original,
                             String formatted,
                             Class<? extends Lexer> lexerClass)
    throws Exception
{
    // Grammar must strip all but real tokens and whitespace (and put that on hidden channel)
    CodeBuffTokenStream original_tokens = Tool.tokenize(original, lexerClass);
    CodeBuffTokenStream formatted_tokens = Tool.tokenize(formatted, lexerClass);
    // Walk both token streams in lockstep and compare the hidden (whitespace)
    // tokens that appear immediately to the left of each real token.
    // NOTE(review): LT(i) starting at i = -1 looks suspect (ANTLR's LT(1) is
    // the current lookahead; LT(0) is undefined) — kept as-is per the
    // original TODO below; verify against CodeBuffTokenStream's semantics.
    int i = -1;
    int ws_distance = 0;
    int original_ws = 0;
    int formatted_ws = 0;
    while ( true ) {
        Token ot = original_tokens.LT(i); // TODO: FIX THIS! can't use LT()
        if ( ot==null || ot.getType()==Token.EOF ) break;
        List<Token> ows = original_tokens.getHiddenTokensToLeft(ot.getTokenIndex());
        original_ws += tokenText(ows).length();
        Token ft = formatted_tokens.LT(i); // TODO: FIX THIS! can't use LT()
        if ( ft==null || ft.getType()==Token.EOF ) break;
        List<Token> fws = formatted_tokens.getHiddenTokensToLeft(ft.getTokenIndex());
        formatted_ws += tokenText(fws).length();
        ws_distance += whitespaceEditDistance(tokenText(ows), tokenText(fws));
        i++;
    }
    // it's probably ok to ignore ws diffs after last real token
    int max_ws = Math.max(original_ws, formatted_ws);
    if ( max_ws == 0 ) {
        // Neither doc has any whitespace: the docs are identical by the
        // method's premise, so the diff is 0 (the old code returned NaN here).
        return 0.0;
    }
    // Cast to double (not float) to avoid an unnecessary precision loss
    // in a double-returning method.
    return ((double) ws_distance) / max_ws;
}