from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

def preprocess_tweets(docs, stopwords, min_df=3, min_term_length=2, ngram_range=(1, 1), apply_tfidf=True, apply_norm=True):
    """
    Preprocess a list of tweets stored as strings, building a TF-IDF-weighted
    document-term matrix using a Twitter-aware tokenizer.
    """
    tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

    def custom_tokenizer(s):
        # need to manually replace quotes before tokenizing
        s = s.replace("'", " ").replace('"', ' ')
        # keep only hashtags and alphabetic tokens of sufficient length
        tokens = []
        for x in tweet_tokenizer.tokenize(s):
            if len(x) >= min_term_length and (x[0] == "#" or x[0].isalpha()):
                tokens.append(x)
        return tokens

    # Build the Vector Space Model, apply TF-IDF, and normalize rows to unit length, all in one call
    norm_function = "l2" if apply_norm else None
    tfidf = TfidfVectorizer(stop_words=stopwords, lowercase=True, strip_accents="unicode",
                            tokenizer=custom_tokenizer, use_idf=apply_tfidf, norm=norm_function,
                            min_df=min_df, ngram_range=ngram_range)
    X = tfidf.fit_transform(docs)
    # invert the vocabulary map so that terms[i] is the term for column i of X
    terms = [""] * len(tfidf.vocabulary_)
    for term, index in tfidf.vocabulary_.items():
        terms[index] = term
    return (X, terms)
# --------------------------------------------------------------
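# A minimal usage sketch of the function above. The sample tweets, the stop word
# list, and min_df=1 are illustrative assumptions (a real corpus would keep the
# default min_df=3); they are not part of the original code.
if __name__ == "__main__":
    sample_docs = [
        "Loving the new #python release, so much faster!",
        "Anyone else finding #python packaging confusing? http://example.com",
        "Great #datascience meetup tonight, thanks @organizer!",
    ]
    stopwords = ["the", "so", "else", "anyone"]
    # min_df=1 because this toy corpus is too small for the default threshold
    X, terms = preprocess_tweets(sample_docs, stopwords, min_df=1)
    print("Document-term matrix shape: %d x %d" % X.shape)
    print("Terms: %s" % terms)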