def df_to_stems(df):
    """Build an inverted index from stems to the DataFrame rows yielding them

    Every cell of every row is handed to ``stems`` (together with the English
    stop-word set and a Porter stemmer); each stem that comes back is
    associated with the index label of the row the cell belongs to.

    Parameters
    ----------
    df : pd.DataFrame
        A pandas DataFrame to index

    Returns
    -------
    dict
        {stem: {set of index labels of the rows that produced the stem}}
    """
    from functools import partial

    import nltk

    # Explicitly select the Martin-extensions mode rather than relying on
    # nltk's default mode: the stemming has to stay portable so that other
    # consumers (for instance a javascript library) can query the index.
    porter = nltk.PorterStemmer(nltk.PorterStemmer.MARTIN_EXTENSIONS)
    stop_words = frozenset(nltk.corpus.stopwords.words('english'))

    # Bind the stop words and the stemmer once; only the cell value varies.
    extract = partial(stems, stop_words, porter)

    index = {}
    for label, record in df.iterrows():
        for cell in record.values:
            for token in extract(cell):
                # First sighting of a stem creates its (empty) label set.
                index.setdefault(token, set()).add(label)
    return index
评论列表
文章目录