def freqs_to_cBpack(input_file, output_file, cutoff=600):
    """
    Convert a frequency list into the idiosyncratic 'cBpack' format that
    will be loaded by wordfreq: a list in msgpack format of frequency
    tiers, each tier being one centibel (a factor of 10^(1/100))
    less frequent than the previous tier.

    `input_file` is an iterable of text lines of the form
    "word<TAB>frequency". Reading stops at the first line that reaches the
    cutoff, so the lines are expected to be ordered from most to least
    frequent; anything after that line is ignored.

    `output_file` is the stream handed to `msgpack.dump`.

    `cutoff` is the number of centibels below a frequency of 1 at which
    words stop being included: a word whose rounded tier index is
    `cutoff` or greater ends the conversion.

    Raises ValueError if the input contains a '__total__' row (which marks
    it as a count file), if a frequency is not positive, or if a frequency
    is so far above 1 that it would land in a negative tier.
    """
    cBpack = []
    for line in input_file:
        word, strfreq = line.rstrip().split('\t', 1)
        if word == '__total__':
            raise ValueError(
                "This is a count file, not a frequency file"
            )
        freq = float(strfreq)
        if freq <= 0:
            # math.log10 would fail here with an opaque "math domain error";
            # say which entry is at fault instead.
            raise ValueError(
                "Frequency for {!r} must be positive, got {!r}".format(
                    word, strfreq
                )
            )

        # Tier index: how many centibels below a frequency of 1 this word is.
        neg_cB = -(round(math.log10(freq) * 100))
        if neg_cB >= cutoff:
            break
        if neg_cB < 0:
            # A negative index would silently count from the end of the
            # tier list and file the word under the wrong frequency.
            raise ValueError(
                "Frequency for {!r} is greater than 1: {!r}".format(
                    word, strfreq
                )
            )

        # Grow the list of tiers until the one we need exists.
        while neg_cB >= len(cBpack):
            cBpack.append([])
        cBpack[neg_cB].append(word)

    for sublist in cBpack:
        sublist.sort()

    # The first element is a header dictionary identifying the format;
    # the tiers follow it.
    cBpack_data = [{'format': 'cB', 'version': 1}] + cBpack
    msgpack.dump(cBpack_data, output_file)
评论列表
文章目录