def get_twitter_sentiment_multilabel_classification_dataset():
    """Load the Twitter airline sentiment data and return a train/test split.

    The data is read from a local HDF5 cache (``tests/twitter_sentiment.h5``,
    relative to the current working directory). If the cache cannot be read,
    the CSV is downloaded and the cache is written for the next call. A 10%
    sample of the rows is then taken, the ``tweet_created`` column is parsed
    with ``pd.to_datetime``, and the sample is split with ``test_size=0.33``.

    Returns:
        tuple: ``(df_twitter_train, df_twitter_test)`` as returned by
        ``train_test_split``.
    """
    file_name = os.path.join('tests', 'twitter_sentiment.h5')

    try:
        df_twitter = pd.read_hdf(file_name)
    except Exception as e:
        # Deliberately broad: whatever the reason the cache could not be
        # read, fall back to downloading a fresh copy below.
        print('Error')
        print(e)
        df_twitter = None

    if df_twitter is None:
        # The download happens outside the ``except`` handler so that a
        # network failure is reported on its own rather than chained onto
        # the cache-read error.
        dataset_url = 'https://raw.githubusercontent.com/ClimbsRocks/sample_datasets/master/twitter_airline_sentiment.csv'
        df_twitter = pd.read_csv(dataset_url, encoding='latin-1')

        # Make sure the cache directory exists before writing; otherwise the
        # write fails and the freshly downloaded data is lost.
        cache_dir = os.path.dirname(file_name)
        if not os.path.isdir(cache_dir):
            os.makedirs(cache_dir)
        df_twitter.to_hdf(file_name, key='df', format='fixed')

    # Grab only 10% of the dataset- runs much faster this way.
    # NOTE(review): no random_state is passed to sample(), so the rows chosen
    # here are not fixed by this function even though the split below is
    # seeded. Confirm whether that is intended.
    df_twitter = df_twitter.sample(frac=0.1)

    df_twitter['tweet_created'] = pd.to_datetime(df_twitter.tweet_created)

    df_twitter_train, df_twitter_test = train_test_split(
        df_twitter, test_size=0.33, random_state=42
    )
    return df_twitter_train, df_twitter_test
评论列表
文章目录