def get_top_candidates(self, source):
    """Return the library licenses that score well against ``source``.

    An n-gram profile is built from ``source.lines`` and compared with the
    n-grams of every license in ``self.license_library.licenses``. A license
    is kept when its similarity score is at least
    ``max(self.threshold, best_score * self.keep_fraction_of_best)``, where
    ``best_score`` is the highest score found across the library.

    Args:
        source: Object exposing a ``lines`` attribute, which is handed to
            ``NGrams.parse_text_list_items``.

    Returns:
        An ``OrderedDict`` mapping license name to similarity score, in the
        iteration order of the library. It is empty when the library holds
        no licenses or when no license reaches the cut-off.
    """
    # First, compute n-grams for all lines in the source file.
    src_ng = n_grams.NGrams()
    src_ng.parse_text_list_items(
        source.lines,
        universe_ng=self.license_library.universe_n_grams)

    # Measure n-gram similarity relative to all licenses in the library.
    similarities = OrderedDict(
        (license_name, lic.n_grams.measure_similarity(src_ng))
        for license_name, lic in iteritems(self.license_library.licenses))

    # max() raises ValueError on an empty sequence, so an empty library is
    # handled explicitly: with nothing to compare there are no candidates.
    if not similarities:
        return OrderedDict()

    # Filter out low-scoring licenses: the cut-off is the stricter of the
    # absolute threshold and a fraction of the best score.
    best_score = max(similarities.values())
    current_threshold = max(self.threshold,
                            best_score * self.keep_fraction_of_best)
    return OrderedDict(
        (license_name, score)
        for license_name, score in iteritems(similarities)
        if score >= current_threshold)
评论列表
文章目录