freq.py 文件源码-python代码片段

freq.py 文件源码

python

阅读 19 收藏 0 点赞 0 评论 0

项目：exquisite-corpus 作者: LuminosoInsight 项目源码文件源码

def freqs_to_cBpack(input_file, output_file, cutoff=600):
    """
    Convert a frequency list into the idiosyncratic 'cBpack' format that
    will be loaded by wordfreq: a list in msgpack format of frequency
    tiers, each tier being one centibel (a factor of 10^(1/100))
    less frequent than the previous tier.

    The serialized value is a list whose first element is the header
    ``{'format': 'cB', 'version': 1}``; the element after it is tier 0
    (frequency ~1), the next is tier 1, and so on. Each tier is an
    alphabetically sorted list of words; tiers with no words are empty
    lists.

    Parameters:
        input_file: an iterable of text lines of the form
            ``word<TAB>frequency``. Blank lines are skipped.
            NOTE(review): reading stops at the first entry whose tier
            reaches `cutoff`, so the input is presumably sorted by
            descending frequency -- confirm against the caller.
        output_file: the stream handed to ``msgpack.dump``.
        cutoff: the first tier index (in negated centibels) that is
            excluded from the output. The default of 600 stops at
            frequencies of about 10^-6.

    Raises:
        ValueError: if the input contains a ``__total__`` entry (which
            marks a file of counts rather than frequencies), if a
            frequency is greater than 1 (it would map to a negative tier),
            or if a frequency cannot be parsed or is not positive.
    """
    cBpack = []
    for line in input_file:
        line = line.rstrip()
        if not line:
            # Tolerate blank lines (such as a trailing empty line) instead
            # of failing with an opaque tuple-unpacking error.
            continue
        word, strfreq = line.split('\t', 1)
        if word == '__total__':
            raise ValueError(
                "This is a count file, not a frequency file"
            )
        freq = float(strfreq)
        # Tier index: how many whole centibels below a frequency of 1.
        neg_cB = -(round(math.log10(freq) * 100))
        if neg_cB >= cutoff:
            break
        if neg_cB < 0:
            # A negative index would silently file the word into a tier
            # counted from the end of the list (or raise IndexError when
            # the list is still empty), corrupting the output.
            raise ValueError(
                "Frequency {!r} of {!r} is greater than 1; frequencies "
                "must be proportions in the range (0, 1]".format(freq, word)
            )
        # Grow the list with empty tiers until this tier exists.
        while neg_cB >= len(cBpack):
            cBpack.append([])
        cBpack[neg_cB].append(word)

    for sublist in cBpack:
        sublist.sort()

    # The header dictionary goes first, so tier i ends up at index i + 1.
    cBpack_data = [{'format': 'cB', 'version': 1}] + cBpack

    msgpack.dump(cBpack_data, output_file)