from nltk.tokenize import TweetTokenizer


def tokenize(tweets, sentiment):
    # NLTK has a tokenizer built specifically for short messaging data.
    # Here we use its features to:
    #   - turn all words to lowercase,
    #   - reduce the length of repeated characters ('hiiiiiiiii' and 'hiiiii'
    #     both become 'hiii', with three repeats of the 'i'),
    #   - and strip any handles that might exist in the message.
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
    tokenizedTweets = []
    cleanedSentiment = []
    asciiIssues = 0
    for rowIdx, tweet in enumerate(tweets):
        try:
            tokenizedWords = tokenizer.tokenize(tweet)
            tokenizedTweets.append(tokenizedWords)
            cleanedSentiment.append(sentiment[rowIdx])
        except UnicodeDecodeError:
            # There are some weird ASCII encoding issues present in a small
            # part of our dataset (< 1% of rows). For the MVP, we skip those
            # rows to focus on the 99% use case. These issues do not exist in
            # the test dataset, so it is safe to ignore them.
            asciiIssues += 1
    return tokenizedTweets, cleanedSentiment
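
# A minimal usage sketch; the sample tweets and labels below are hypothetical,
# not taken from the original dataset. tokenize() returns parallel lists, so
# the sentiment labels stay aligned with their tweets even when rows are skipped.
sampleTweets = ['@user I looooove this!', 'this is terrible...']
sampleSentiment = [1, 0]
sampleTokens, sampleLabels = tokenize(sampleTweets, sampleSentiment)
# sampleTokens -> [['i', 'looove', 'this', '!'], ['this', 'is', 'terrible', '...']]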

# Some algorithms do not train well on ordered data. This function shuffles our
# data so we don't have one big block of positive documents followed by another
# large block of negative documents.
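
# A minimal sketch of such a shuffle (the name shuffleData, the seed parameter,
# and the use of random.Random are assumptions, not necessarily the original
# implementation): shuffle the two parallel lists in unison so each sentiment
# label stays paired with its tweet.
import random

def shuffleData(tokenizedTweets, sentiment, seed=42):
    # Zip the parallel lists so each (tweet, label) pair moves together,
    # shuffle the pairs with a fixed seed for reproducibility, then unzip
    # back into two lists.
    pairs = list(zip(tokenizedTweets, sentiment))
    random.Random(seed).shuffle(pairs)
    shuffledTweets, shuffledSentiment = zip(*pairs)
    return list(shuffledTweets), list(shuffledSentiment)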