spark_nounfiltersort.py 文件源码-python代码片段

spark_nounfiltersort.py 文件源码

python

阅读 19 收藏 0 点赞 0 评论 0

项目：noungroups 作者: gushecht 项目源码文件源码

def main(in_dir, out_dir):
    """Rank label-bearing tokens found under ``in_dir`` by relative frequency.

    Every input line is split on single spaces; a token is kept when it
    contains at least one entry of the module-level ``LABELS``. Each kept
    token's occurrence count is divided by the total number of kept tokens,
    and the resulting ``(token, share)`` pairs are written, highest share
    first, one ``str()``-formatted tuple per line, to
    ``<out_dir>/sorted_nouns.txt``.

    Args:
        in_dir: Path handed to ``SparkContext.textFile``.
        out_dir: Existing local directory that receives ``sorted_nouns.txt``.

    If no token matches, an empty output file is written.
    """
    sc = ps.SparkContext()
    try:
        counts = (
            sc.textFile(in_dir)
            .flatMap(lambda line: line.split(' '))
            .filter(lambda word: any(label in word for label in LABELS))
            .map(lambda word: (word, 1))
            .reduceByKey(add)
            # Reused twice below (total + normalisation), so keep it around.
            .persist(storageLevel=ps.StorageLevel.MEMORY_AND_DISK)
        )

        # sum() yields 0 on an empty RDD, whereas reduce() would raise
        # ValueError when nothing matched LABELS.
        total_nouns = counts.values().sum()

        # Pairs are indexed rather than unpacked in the lambda signature:
        # tuple parameters were removed in Python 3 (PEP 3113).
        # float() keeps true division on Python 2 as well.
        sorted_nouns = (
            counts
            .map(lambda pair: (pair[0], pair[1] / float(total_nouns)))
            .sortBy(lambda pair: pair[1], ascending=False)
            .collect()
        )
    finally:
        # Always release the context, even when a stage fails.
        sc.stop()

    with open(path.join(out_dir, 'sorted_nouns.txt'), 'w') as f:
        for entry in sorted_nouns:
            f.write(str(entry) + '\n')