def parseTweetSet(tweets_data_path):
    """Return the distinct texts of stopword-rich tweets stored in a file.

    The file is read line by line; each non-blank line is parsed as one
    JSON object. A tweet's text is kept when its lower-cased tokens contain
    more than two distinct English stopwords (presumably a cheap filter for
    English-language tweets).

    Args:
        tweets_data_path: Path of a text file holding one JSON-encoded
            tweet per line.

    Returns:
        A list of the unique ``text`` values of the tweets that passed the
        filter. The order of the list is unspecified.

    Raises:
        IOError/OSError: If the file cannot be opened.
        ValueError: If a non-blank line is not valid JSON.
    """
    # Build the stopword set once; it is invariant across all tweets.
    english_stopwords_set = set(stopwords.words('english'))
    tweets_text_set = set()

    # The context manager guarantees the file is closed, even if a line
    # fails to parse.
    with open(tweets_data_path, "r") as tweets_file:
        for line in tweets_file:
            # Skip blank separator lines instead of letting json.loads
            # raise on them.
            if not line.strip():
                continue

            tweet = json.loads(line)

            # Records without a text field carry nothing to filter, so
            # skip them rather than abort the whole parse.
            text = tweet.get('text')
            if text is None:
                continue

            words_set = set(word.lower() for word in wordpunct_tokenize(text))
            common_elements = words_set & english_stopwords_set
            if len(common_elements) > 2:
                # Adding to a set removes duplicate texts as we go.
                tweets_text_set.add(text)

    return list(tweets_text_set)
analyzeTweets.py 文件源码
python
阅读 27
收藏 0
点赞 0
评论 0
评论列表
文章目录