def build_frequency_file(dtatcfdir, freq_file, MIN_FREQ, join_sign):
    """
    Build a file with all lemma + POS pairs at or above a frequency threshold.

    Counts the items returned by ``build_lemma_list(dtatcfdir, join_sign)``
    and writes every item whose count is at least ``MIN_FREQ`` to
    ``freq_file``, one item per line, encoded as UTF-8. An existing file at
    that path is overwritten.

    :param dtatcfdir: path to directory with dta tcf files
    :param freq_file: path to frequency file (output)
    :param MIN_FREQ: frequency threshold (inclusive lower bound on the count)
    :param join_sign: sign to join lemma + first char of POS
    """
    # Function-scope import: keeps this block self-contained without relying
    # on a module-level ``import io``.
    import io

    # Parenthesised single-argument print behaves identically on Python 2
    # (print statement) and Python 3 (print function).
    print('Building frequency file to ' + freq_file + "...")

    lemma_count = Counter(build_lemma_list(dtatcfdir, join_sign))

    # io.open does the UTF-8 encoding once at the file boundary, instead of
    # encoding each lemma by hand, and works the same on Python 2 and 3.
    with io.open(freq_file, 'w', encoding='utf-8') as f_out:
        for lemma, count in lemma_count.items():
            if count >= MIN_FREQ:
                f_out.write(lemma + u'\n')
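# Comment list        (web-page navigation residue, not part of the code)
# Table of contents   (web-page navigation residue, not part of the code)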