def make_data(file_name):
    '''Load raw tweet records from *file_name* and build the analysis dataframes.

    Parameters
    ----------
    file_name : str
        Path to a file with one tweet record per line
        (e.g. realDonaldTrump_master_tweet_list.json). Lines may be proper
        JSON or Python ``str(dict)`` output.

    Returns
    -------
    tuple
        (core_tweet_df, tweets_list, pos_df, adj_df, word_frequency_df, hash_df)
    '''
    import ast
    import json

    # The file was written with str(dict) rather than json.dumps, so try
    # json.loads first and fall back to ast.literal_eval, which safely parses
    # Python literals -- unlike eval(), which would execute arbitrary code
    # embedded in the file.
    # TODO: fix the writer so records are proper JSON and json.loads suffices.
    raw_tweets_data = []
    with open(file_name) as tfile:
        for line in tfile:
            line = line.strip()
            if not line:
                continue
            try:
                raw_tweets_data.append(json.loads(line))
            except ValueError:
                raw_tweets_data.append(ast.literal_eval(line))

    analyzer = TextAnalyzer(raw_tweets_data)
    core_tweet_df = analyzer.make_tweet_df(
        with_pos_tags=False,
        columns_to_filter=['id', 'created_at', 'text', 'retweet_count', 'favorite_count'])
    # Tweets as plain text strings.
    tweets_list = core_tweet_df.text.tolist()
    pos_df = analyzer.make_pos_df(tweets_list, make_csv=False)
    # Adjectives only (Penn Treebank tag 'JJ').
    adj_df = pos_df[pos_df.pos_tag=='JJ']
    adj_df = analyzer.make_word_frequency_df(adj_df, 'word', make_csv=False)
    # Word frequencies across the whole data set. Can't merge with pos_df
    # because a word may carry several parts of speech.
    word_frequency_df = analyzer.make_word_frequency_df(pos_df, 'word', make_csv=False)
    # Most common hashtags and total unique hashtags.
    all_hashtags = []
    for tweet in raw_tweets_data:
        all_hashtags.extend(h['text'] for h in tweet['entities']['hashtags'])
    fd = FreqDist(all_hashtags)
    total_hashtags = len(all_hashtags)
    # Guard against ZeroDivisionError when no tweet carries a hashtag.
    hash_df = pd.DataFrame([
        {'hashtag': tag,
         'abs_frequency': count,
         'rel_frequency_pct': (float(count) / total_hashtags * 100) if total_hashtags else 0.0}
        for tag, count in fd.most_common()])
    return core_tweet_df, tweets_list, pos_df, adj_df, word_frequency_df, hash_df