def generateStemmingDict(
    inputPath: str = 'stemmer.txt',
    outputPath: str = 'stemmingDict',
) -> None:
    """Build a word -> stems lookup from a stemmer rule file and write it out.

    Each non-blank input line has the form ``word1, word2 => stem``. Every
    word on the left is mapped to the stem on the right; a word that appears
    on several lines collects all of its distinct stems. Each stem is stored
    together with ``Levenshtein.ratio(stem, word)``.

    Two files are written, both UTF-8:

    * ``outputPath`` -- the mapping as JSON,
      ``{word: [[stem, ratio], ...], ...}``.
    * ``outputPath + '.txt'`` -- one ``word [[stem, ratio], ...]`` line per
      word.

    Args:
        inputPath: Path of the UTF-8 rule file to read.
        outputPath: Path of the JSON output; the text dump is written to
            this path with ``.txt`` appended.

    Raises:
        IndexError: If a non-blank line has no ``' => '`` separator.
    """
    encoding = 'utf8'

    # Parse the whole input before any output file is opened, so a bad rule
    # file does not leave truncated outputs behind.
    stemsByWord = {}
    with open(inputPath, 'r', encoding=encoding) as rules:
        for rawLine in rules:
            line = rawLine.strip()
            if not line:
                continue
            parts = line.split(' => ')
            stem = parts[1]
            for word in parts[0].split(', '):
                stemsByWord.setdefault(word, set()).add(stem)

    # Sort the stems: set iteration order is not stable between runs, and
    # sorting keeps both output files reproducible.
    stemmingDict = {
        word: [[stem, Levenshtein.ratio(stem, word)] for stem in sorted(stems)]
        for word, stems in stemsByWord.items()
    }

    with open(outputPath, 'w', encoding=encoding) as jsonOut:
        json.dump(stemmingDict, jsonOut)

    with open(outputPath + '.txt', 'w', encoding=encoding) as textOut:
        for word, pairs in stemmingDict.items():
            textOut.write(word + ' ' + str(pairs) + '\n')
# 评论列表 (web-page residue: "comment list")
# 文章目录 (web-page residue: "table of contents")