nlp_utils.py 文件源码-python代码片段

nlp_utils.py 文件源码

python

阅读 23 收藏 0 点赞 0 评论 0

项目：search_relevance 作者: rmanak 项目源码文件源码

def getBOW(token_pattern=token_pattern,
           max_df=bow__max_df,
           min_df=bow__min_df,
           ngram_range=(1, 1),
           vocabulary=None,
           stop_words='english'):
    """Create a word-level bag-of-words ``CountVectorizer``.

    The vectorizer is only constructed here; it is not fitted to any data.

    Args:
        token_pattern: Regex passed through as the tokenizer pattern.
            Defaults to the module-level ``token_pattern`` value.
        max_df: Upper document-frequency cutoff, forwarded unchanged.
            Defaults to the module-level ``bow__max_df`` value.
        min_df: Lower document-frequency cutoff, forwarded unchanged.
            Defaults to the module-level ``bow__min_df`` value.
        ngram_range: ``(min_n, max_n)`` tuple forwarded unchanged;
            the default ``(1, 1)`` requests unigrams only.
        vocabulary: Optional fixed vocabulary, forwarded unchanged.
        stop_words: Stop-word setting, forwarded unchanged
            (``'english'`` by default).

    Returns:
        A new ``CountVectorizer`` configured with the options above.
    """
    # Options this factory always pins, regardless of the caller's input.
    vectorizer_options = dict(
        analyzer='word',
        strip_accents='unicode',
        max_features=None,
    )
    # Options taken straight from the caller (or from the defaults).
    vectorizer_options.update(
        token_pattern=token_pattern,
        ngram_range=ngram_range,
        min_df=min_df,
        max_df=max_df,
        stop_words=stop_words,
        vocabulary=vocabulary,
    )
    return CountVectorizer(**vectorizer_options)


########################################################

# ------------------------------
# Simple text cleaning using 
#        
#     -replacement dict 
#        
#        or
#        
#     -WordReplacer object
#--------------------------------