import urllib.parse

import requests
import Levenshtein


def getCandidatesForLemma(lemma, min_size, max_size):
    hits = []
    for match in ["phrase", "conjunct"]:
        # Query the LOTUS index of the LOD Laundromat for DBpedia resources
        # whose label matches the lemma.
        url = ("http://lotus.lodlaundromat.org/retrieve?size=" + str(max_size)
               + "&match=" + match + "&rank=psf&noblank=true&"
               + urllib.parse.urlencode({"string": lemma,
                                         "predicate": "label",
                                         "subject": "\"http://dbpedia.org/resource\""}))
        r = requests.get(url=url)
        content = r.json()
        these_hits = content["hits"]
        hits = hits + these_hits
        # Only fall back to "conjunct" matching when the phrase query returned
        # too few hits and the lemma consists of more than one word.
        if content["numhits"] >= min_size or len(lemma.split(' ')) == 1:
            break

    subjects = {}
    for hit in hits:
        # String similarity between the matched label and the lemma.
        lev_sim = Levenshtein.ratio(hit["string"].lower(), lemma.lower())
        # Skip disambiguation pages and category resources.
        if "disambiguation" not in hit["subject"].lower() and "Category" not in hit["subject"]:
            if hit["subject"] not in subjects:
                subjects[hit["subject"]] = {"ss": lev_sim, "count": 1}
            else:
                # Keep the best similarity seen so far and count how many labels matched.
                subjects[hit["subject"]]["ss"] = max(subjects[hit["subject"]]["ss"], lev_sim)
                subjects[hit["subject"]]["count"] += 1
    return subjects
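
# --- Usage sketch (illustrative, not part of the original function) ---
# Assumes the LOTUS endpoint above is reachable; the lemma and the
# min_size/max_size thresholds are example values chosen for demonstration.
if __name__ == "__main__":
    candidates = getCandidatesForLemma("Barack Obama", min_size=10, max_size=50)
    # Rank candidate subjects by best label similarity, then by match count.
    ranked = sorted(candidates.items(),
                    key=lambda item: (item[1]["ss"], item[1]["count"]),
                    reverse=True)
    for subject, scores in ranked[:5]:
        print(subject, scores["ss"], scores["count"])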