import logging

from sklearn.feature_extraction.text import CountVectorizer


def compute_tf(data, stopwords_list, language, use_lemmer=True, min_df=2, max_df=0.8):
    """
    Compute the term-frequency (tf) matrix for the provided data.

    :param data: iterable of documents (strings) to vectorize
    :param stopwords_list: list of stopwords to exclude from the vocabulary
    :param language: 'en' or 'it'
    :param use_lemmer: if True, tokenize and lemmatize with LemNormalize ('en')
        or LemNormalizeIt ('it')
    :param min_df: minimum document frequency for a term to be kept
    :param max_df: maximum document frequency (proportion of documents) above
        which a term is discarded
    :return: tuple (tf sparse matrix, list of feature names)
    """
    # LemNormalize / LemNormalizeIt are project tokenizers assumed to be
    # imported elsewhere in the module.
    lemmer_tokenizer = None
    if use_lemmer:
        if language == 'it':
            lemmer_tokenizer = LemNormalizeIt
        else:
            lemmer_tokenizer = LemNormalize

    # Relax the document-frequency bounds on very small corpora so that
    # the vectorizer does not discard every term.
    min_df = min_df if len(data) > min_df else 1
    max_df = max_df if max_df * len(data) >= min_df else 1.0
    # Term-frequency (raw count) vectorizer
    tf_vectorizer = CountVectorizer(tokenizer=lemmer_tokenizer,
                                    max_df=max_df, min_df=min_df,
                                    max_features=None,
                                    stop_words=stopwords_list,
                                    token_pattern="[a-zA-Z]{3,}")
    try:
        tf = tf_vectorizer.fit_transform(data)
        # Note: on scikit-learn >= 1.2 this call is get_feature_names_out()
        tf_features_names = tf_vectorizer.get_feature_names()
    except ValueError:
        # CountVectorizer raises ValueError when the resulting vocabulary is
        # empty (e.g. every token was filtered out as a stopword).
        logging.warning('The computed tf matrix is empty. Check stopwords.')
        tf = []
        tf_features_names = []
    return tf, tf_features_names
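

# A minimal usage sketch (illustrative only: the sample documents and
# stopword list below are assumptions, not part of the original project;
# use_lemmer is disabled so the example does not depend on the external
# LemNormalize / LemNormalizeIt helpers).
if __name__ == '__main__':
    sample_docs = [
        'The quick brown fox jumps over the lazy dog.',
        'A lazy dog sleeps in the sun all day.',
        'Foxes and dogs are surprisingly common in the city.',
    ]
    sample_stopwords = ['the', 'a', 'and', 'in', 'all', 'are', 'over']
    tf_matrix, feature_names = compute_tf(sample_docs, sample_stopwords,
                                          language='en', use_lemmer=False,
                                          min_df=1, max_df=1.0)
    print(feature_names)
    print(tf_matrix.toarray() if hasattr(tf_matrix, 'toarray') else tf_matrix)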