from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

def preprocess_tweets(docs, stopwords, min_df=3, min_term_length=2, ngram_range=(1, 1), apply_tfidf=True, apply_norm=True):
    """
    Preprocess a list of tweets stored as strings, building a TF-IDF-weighted
    document-term matrix using a Twitter-aware tokenizer.
    """
    tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

    def custom_tokenizer(s):
        # need to manually replace quotes before tokenizing
        s = s.replace("'", " ").replace('"', ' ')
        # keep only hashtags and alphabetic tokens of sufficient length
        tokens = []
        for x in tweet_tokenizer.tokenize(s):
            if len(x) >= min_term_length and (x[0] == "#" or x[0].isalpha()):
                tokens.append(x)
        return tokens

    # Build the Vector Space Model, apply TF-IDF, and normalize rows to unit length, all in one call
    norm_function = "l2" if apply_norm else None
    tfidf = TfidfVectorizer(stop_words=stopwords, lowercase=True, strip_accents="unicode",
                            tokenizer=custom_tokenizer, use_idf=apply_tfidf, norm=norm_function,
                            min_df=min_df, ngram_range=ngram_range)
    X = tfidf.fit_transform(docs)
    # invert the vocabulary map so that terms[i] is the term for column i of X
    terms = [""] * len(tfidf.vocabulary_)
    for term, index in tfidf.vocabulary_.items():
        terms[index] = term
    return (X, terms)
# --------------------------------------------------------------
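# A minimal usage sketch of the function above. The sample tweets, the stop word
# list, and min_df=1 are illustrative assumptions (a real corpus would keep the
# default min_df=3); they are not part of the original code.
if __name__ == "__main__":
    sample_docs = [
        "Loving the new #python release, so much faster!",
        "Anyone else finding #python packaging confusing? http://example.com",
        "Great #datascience meetup tonight, thanks @organizer!",
    ]
    stopwords = ["the", "so", "else", "anyone"]
    # min_df=1 because this toy corpus is too small for the default threshold
    X, terms = preprocess_tweets(sample_docs, stopwords, min_df=1)
    print("Document-term matrix shape: %d x %d" % X.shape)
    print("Terms: %s" % terms)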