stsTwitterCorpus.py 文件源码-python代码片段

def getFeatures(numWordsToUse, allTweets, allTweetsSentiment):
    # each corpus's getFeatures function is responsible for somehow loading in their own allTweets and allTweetsSentiment data
    # then they have to ensure that data is tokenized (leveraging the modular tokenization functionality in utils)
    # then shuffle the dataset
    # then create the frequency distribution and popularWords
    # then extract features from each tweet, and un-combine the sentiment again


    global popularWords
    formattedTweets, sentiment, popularWords = utils.nlpFeatureEngineering(
            allTweets, allTweetsSentiment,0,numWordsToUse,'counts'
        )

    # right now we have a data structure roughly equivalent to a dense matrix, except each row is a dictionary
    # DictVectorizer performs two key functions for us:
        # 1. transforms each row from a dictionary into a vector using consistent placing of keys into indexed positions within each vector
        # 2. returns sparse vectors, saving enormous amounts of memory which becomes very useful when training our models
    sparseFeatures = dv.fit_transform(formattedTweets)

    return sparseFeatures, sentiment