util.py 文件源码

python
阅读 22 收藏 0 点赞 0 评论 0

项目:redbiom 作者: biocore 项目源码 文件源码
def df_to_stems(df):
    """Build a mapping from each stem to the rows of a DataFrame yielding it

    Every value of every row is handed to `stems` (together with a set of
    English stopwords and a Porter stemmer), and each stem produced is
    associated with the index label of the row the value came from.

    Parameters
    ----------
    df : pd.DataFrame
        The pandas DataFrame whose values are to be indexed

    Returns
    -------
    dict
        {stem: {set of index labels}}
    """
    import functools
    import nltk

    # The stemmer mode is requested explicitly instead of relying on the
    # nltk default: the stemming has to be portable so that other clients
    # (a javascript library, for example) can issue queries
    porter = nltk.PorterStemmer(nltk.PorterStemmer.MARTIN_EXTENSIONS)

    english_stopwords = frozenset(nltk.corpus.stopwords.words('english'))

    # bind the fixed arguments once so only the value varies per call
    extract_stems = functools.partial(stems, english_stopwords, porter)

    stem_to_labels = {}

    for label, record in df.iterrows():
        for cell in record.values:
            for token in extract_stems(cell):
                # create the label set the first time a stem is observed
                stem_to_labels.setdefault(token, set()).add(label)

    return stem_to_labels
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号