def load_data(self):
    """Load the train/dev/test user tables from ``self.data_home``.

    Reads ``user_info.train.gz``, ``user_info.dev.gz`` and
    ``user_info.test.gz`` as header-less, tab-delimited files decoded with
    ``self.encoding`` and with the columns ``user``, ``lat``, ``lon`` and
    ``text``.  Malformed lines are skipped rather than aborting the load.

    Every split is cleaned in the same way:

    * rows containing any missing value are dropped;
    * ``user`` is converted to a lower-case string;
    * when a user occurs more than once, only the last row is kept;
    * the frame is indexed by ``user`` and sorted by that index.

    The results are stored on ``self.df_train``, ``self.df_dev`` and
    ``self.df_test``; nothing is returned.
    """
    import inspect

    logging.info('loading the dataset from %s', self.data_home)

    # ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0 in
    # favour of ``on_bad_lines``.  Choose whichever keyword the installed
    # pandas understands so the loader works on both old and new releases.
    if 'on_bad_lines' in inspect.signature(pd.read_csv).parameters:
        skip_bad_lines = {'on_bad_lines': 'skip'}
    else:
        skip_bad_lines = {'error_bad_lines': False}

    def read_split(split):
        # Read a single split file and apply the shared cleaning steps.
        path = os.path.join(self.data_home, 'user_info.%s.gz' % split)
        frame = pd.read_csv(
            path,
            delimiter='\t',
            encoding=self.encoding,
            names=['user', 'lat', 'lon', 'text'],
            # QUOTE_NONE: quote characters in the text are ordinary data.
            quoting=csv.QUOTE_NONE,
            **skip_bad_lines
        )
        return (
            frame.dropna()
            # Lower-casing happens before de-duplication, so user names that
            # differ only by case collapse into a single row.
            .assign(user=lambda d: d['user'].map(lambda u: str(u).lower()))
            .drop_duplicates(['user'], keep='last')
            .set_index(['user'], drop=True, append=False)
            .sort_index()
        )

    self.df_train = read_split('train')
    self.df_dev = read_split('dev')
    self.df_test = read_split('test')
评论列表
文章目录