import re
import string
from collections import Counter  # token_freqs is expected to be a Counter

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


def tweet_stemming(tweet, token_freqs):
"""
    Stems the tweet's words and counts vocabulary diversity
:param tweet: the tweet to analyze
:type tweet: str or unicode
:param token_freqs: counter of words frequency
:type token_freqs: Counter
    :returns: number of tokens added to token_freqs
:rtype: int
"""
    pattern_url = r'((https?:\/\/)|www\.)([\da-z\.-]+)\.([\/\w \.-]*)( |$)'
regex_punctuation = re.compile('[%s]' % re.escape(string.punctuation))
porter = PorterStemmer()
counter_tokens = 0
tweet_url_removed = re.sub(pattern_url, '', tweet, flags=re.MULTILINE) # remove URL
tweet_url_removed_tokenized = word_tokenize(tweet_url_removed) # tokenize tweet
    tweet_url_removed_tokenized_cleaned_stemming = []  # tokens cleaned of punctuation/hashtags and stemmed
for token in tweet_url_removed_tokenized:
        new_token = regex_punctuation.sub(u'', token)  # strip punctuation, including the '#' of hashtags
        if new_token != u'':
new_token_stemming = porter.stem(new_token)
tweet_url_removed_tokenized_cleaned_stemming.append(new_token_stemming)
token_freqs[new_token_stemming] += 1
counter_tokens += 1
return counter_tokens
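

# A minimal usage sketch, assuming NLTK and its 'punkt' tokenizer models are
# installed (e.g. via nltk.download('punkt')); the sample tweet below is
# hypothetical illustration data, not from the original source.
if __name__ == '__main__':
    freqs = Counter()
    sample = u'Stemming tweets is useful! See https://example.com #nlp'
    added = tweet_stemming(sample, freqs)
    print('tokens added:', added)  # count of stemmed tokens recorded
    print('frequencies:', freqs)   # e.g. Counter({'stem': 1, 'tweet': 1, ...})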