stop_words.py 文件源码

python
阅读 22 收藏 0 点赞 0 评论 0

项目:gfan 作者: hozuki 项目源码 文件源码
def read_all_stop_words() -> Set[str]:
    """Build the combined stop-word set used by the project.

    Three sources are merged:

    * every line of ``data/chinese_stop_words.txt``,
    * every line of ``data/chinese_stop_symbols.txt``,
    * whatever ``get_stop_words("zh")`` returns.

    Both files are opened as UTF-8 text via paths relative to the current
    working directory, and newline characters are removed from each line
    before it is added. No other normalisation is applied, so a blank line
    in either file contributes an empty string to the result.

    Returns:
        A newly created set holding the union of all three sources.
    """
    # Data source: https://wenku.baidu.com/view/7ca26338376baf1ffc4fad6a.html

    def load_entries(path):
        # One entry per line of the file, with the newline characters removed.
        with open(path, mode="r", encoding="utf-8") as handle:
            return [raw.replace("\n", "") for raw in handle]

    # Read both local files first, then query the public list, so failures
    # surface in the same order as before.
    word_entries = load_entries("data/chinese_stop_words.txt")
    symbol_entries = load_entries("data/chinese_stop_symbols.txt")
    public_entries = get_stop_words("zh")

    return set().union(word_entries, symbol_entries, public_entries)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号