analysis.py 文件源码

python
阅读 25 收藏 0 点赞 0 评论 0

项目:tweet_analyzer 作者: atandy 项目源码 文件源码
def make_data(file_name):
    '''Return the tuple of dataframes used in analysis:
    (core_tweet_df, tweets_list, pos_df, adj_df, word_frequency_df, hash_df).

    Parameters
    ----------
    file_name : str
        Path to a file with one tweet per line, where each line is a
        Python-literal dict (e.g. realDonaldTrump_master_tweet_list.json).
    '''
    import ast
    from collections import Counter

    # TODO: fix so strings aren't written to file and we can just load it as json.
    # ast.literal_eval safely parses the repr()-style dict strings; the previous
    # eval() would execute arbitrary code embedded in the file.
    with open(file_name) as tfile:
        raw_tweets_data = [ast.literal_eval(line) for line in tfile]

    analyzer = TextAnalyzer(raw_tweets_data)

    core_tweet_df = analyzer.make_tweet_df(
        with_pos_tags=False,
        columns_to_filter=['id', 'created_at', 'text', 'retweet_count', 'favorite_count'])

    # get list of tweets as text
    tweets_list = core_tweet_df.text.tolist()
    pos_df = analyzer.make_pos_df(tweets_list, make_csv=False)
    adj_df = pos_df[pos_df.pos_tag == 'JJ']
    adj_df = analyzer.make_word_frequency_df(adj_df, 'word', make_csv=False)

    # calculate word frequencies among other words in data set. can't merge with pos
    # because certain words have many parts of speech.
    word_frequency_df = analyzer.make_word_frequency_df(pos_df, 'word', make_csv=False)

    # Most common hashtags and total unique hashtags. Counter.most_common()
    # gives the same count-descending ordering as nltk.FreqDist.most_common()
    # (FreqDist subclasses Counter).
    all_hashtags = []
    for tweet in raw_tweets_data:
        all_hashtags.extend(h['text'] for h in tweet['entities']['hashtags'])

    total = len(all_hashtags)
    if total:
        counts = Counter(all_hashtags)
        hash_df = pd.DataFrame([
            {'hashtag': tag,
             'abs_frequency': n,
             'rel_frequency_pct': float(n) / total * 100}
            for tag, n in counts.most_common()])
    else:
        # Guard against ZeroDivisionError when no tweet carries a hashtag;
        # return an empty frame with the expected columns instead.
        hash_df = pd.DataFrame(
            columns=['hashtag', 'abs_frequency', 'rel_frequency_pct'])

    return core_tweet_df, tweets_list, pos_df, adj_df, word_frequency_df, hash_df
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号