def _split_train_tst(self, test_fraction=0.01):
    """
    Shuffle the dataset and divide it into training and testing portions.

    A permutation of the sample indices is either generated and pickled to
    the mapper file reported by ``self.checkpointer`` or, when a mapper
    checkpoint already exists, read back from that file so the same
    ordering is reused. ``self.X`` and ``self.Y`` are both reordered with
    that single permutation, so row ``i`` of ``X`` still corresponds to
    row ``i`` of ``Y``.

    Sets ``self.test_size``, ``self.train_size``, ``self.X_trn``,
    ``self.X_tst``, ``self.Y_trn`` and ``self.Y_tst``. The test portion is
    taken from the tail of the shuffled data.

    NOTE(review): ``self.X`` / ``self.Y`` are assumed to support indexing
    with a list of integers (e.g. numpy arrays) -- confirm against caller.

    :param test_fraction: fraction of the samples held out for testing;
        the default of 0.01 is the value that used to be hard-coded.
    :return: None
    """
    num_samples = self.Y.shape[0]
    mapper_file = self.checkpointer.get_mapper_file_location()
    if not self.checkpointer.is_mapper_checkpointed():
        print('No mapper checkpoint found. Fresh loading in progress ...')
        # Materialise the indices as a list: random.shuffle works in place
        # and cannot shuffle a lazy range object.
        sample_id = list(range(num_samples))
        random.shuffle(sample_id)
        print('Dumping the mapper shuffle for reuse.')
        # Context manager guarantees the dump is flushed and closed before
        # anything tries to read it back.
        with open(mapper_file, 'wb') as mapper_out:
            Pickle.dump(sample_id, mapper_out)
        print('Dump complete. Moving Forward...')
    else:
        print('Mapper Checkpoint found... Reading from mapper dump')
        # NOTE: unpickling is only safe because this file is one we wrote
        # ourselves; never point the mapper location at untrusted data.
        with open(mapper_file, 'rb') as mapper_in:
            sample_id = Pickle.load(mapper_in)
        print('Mapping unpickling complete.. Moving forward...')

    # Apply the same permutation to both arrays to keep the pairs aligned.
    self.X = self.X[sample_id]
    self.Y = self.Y[sample_id]

    # Now divide the data into test and train set
    self.test_size = int(test_fraction * num_samples)
    self.train_size = num_samples - self.test_size

    # Review (per the original comments, X holds the reviews)
    self.X_trn = self.X[0:self.train_size]
    self.X_tst = self.X[self.train_size:num_samples]

    # Summary (per the original comments, Y holds the summaries)
    self.Y_trn = self.Y[0:self.train_size]
    self.Y_tst = self.Y[self.train_size:num_samples]
# Comment list
# Table of contents