import re

from sklearn.feature_extraction.text import TfidfVectorizer


def preprocess(docs, stopwords, min_df=3, min_term_length=2, ngram_range=(1, 1),
               apply_tfidf=True, apply_norm=True, lemmatize=False):
"""
Preprocess a list containing text documents stored as strings.
"""
    token_pattern = re.compile(r"\b\w\w+\b", re.U)
    if lemmatize:
        from nltk.stem import WordNetLemmatizer
        wnl = WordNetLemmatizer()

    def normalize(x):
        # lower-case each token and optionally lemmatize it
        x = x.lower()
        if lemmatize:
            return wnl.lemmatize(x)
        return x

    def custom_tokenizer(s):
        # keep tokens that are long enough and start with a letter
        return [normalize(x) for x in token_pattern.findall(s)
                if len(x) >= min_term_length and x[0].isalpha()]
    # Build the Vector Space Model, apply TF-IDF weighting and (optionally)
    # normalize each document vector to unit length, all in one call
    if apply_norm:
        norm_function = "l2"
    else:
        norm_function = None
    tfidf = TfidfVectorizer(stop_words=stopwords, lowercase=True, strip_accents="unicode",
                            tokenizer=custom_tokenizer, use_idf=apply_tfidf,
                            norm=norm_function, min_df=min_df, ngram_range=ngram_range)
    X = tfidf.fit_transform(docs)
    # rebuild the ordered term list from the fitted vocabulary map,
    # so that terms[j] is the term associated with column j of X
    terms = [""] * len(tfidf.vocabulary_)
    for term, idx in tfidf.vocabulary_.items():
        terms[idx] = term
    return (X, terms)
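

# A minimal usage sketch, not part of the original snippet: the toy documents
# and the short stopword list below are made-up placeholders, assumed only to
# illustrate how preprocess() might be called on a small corpus.
if __name__ == "__main__":
    sample_docs = [
        "Topic modelling finds latent themes in a corpus of documents.",
        "TF-IDF weighting downweights terms that appear in many documents.",
        "Document vectors are normalized to unit length before clustering.",
    ]
    sample_stopwords = ["a", "the", "of", "in", "to", "that", "are"]
    # min_df=1 so terms are kept even in this tiny three-document corpus
    X, terms = preprocess(sample_docs, sample_stopwords, min_df=1)
    print("Document-term matrix shape:", X.shape)
    print("First few terms:", terms[:5])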