def process_data(self, input_file, onto_aware, for_test=False):
    '''
    Reads an input file and makes input for training or testing.

    Every non-blank line of the file must hold an integer label and a tagged
    sentence separated by a single tab character. Blank lines are skipped.

    Arguments:
        input_file: path of the tab-separated data file to read.
        onto_aware: forwarded unchanged to ``self.data_processor.prepare_input``.
        for_test: when True, the sentence length limit is taken from the
            second dimension of the trained model's input shape and the data
            is kept in file order; when False, the limit is the longest
            sentence in the file and the examples are shuffled.

    Returns:
        A pair ``(inputs, labels)``: the result of
        ``self.data_processor.prepare_input`` and the result of
        ``self.data_processor.make_one_hot`` on the adjusted labels.

    Raises:
        RuntimeError: if ``for_test`` is set but ``self.model`` is not
            available.
        ValueError: if a non-blank line does not consist of exactly two
            tab-separated fields, or its label is not an integer.
    '''
    dataset_type = "test" if for_test else "training"
    # sys.stderr.write emits the same bytes as the Python 2 print statement,
    # and also works on Python 3.
    sys.stderr.write("Reading %s data\n" % dataset_type)
    label_ind = []
    tagged_sentences = []
    all_sentence_lengths = []
    max_sentence_length = 0
    # The context manager guarantees the file is closed, even when a
    # malformed line raises part way through.
    with open(input_file) as data_file:
        for line in data_file:
            stripped = line.strip()
            if not stripped:
                # Tolerate empty lines (e.g. a trailing newline at EOF).
                continue
            label, tagged_sentence = stripped.split("\t")
            sentence_length = len(tagged_sentence.split())
            all_sentence_lengths.append(sentence_length)
            max_sentence_length = max(max_sentence_length, sentence_length)
            label_ind.append(int(label))
            tagged_sentences.append(tagged_sentence)
    if for_test:
        if not self.model:
            raise RuntimeError("Model not trained yet!")
        input_shape = self.model.get_input_shape_at(0)  # (num_sentences, num_words, ...)
        sentlenlimit = input_shape[1]
    else:
        sentlenlimit = max_sentence_length
    # We need to readjust the labels because padding would affect the sentence indices.
    # NOTE(review): the shift equals the number of padded positions, which
    # presumably means padding is added at the front -- confirm against
    # data_processor.prepare_input. For test sentences longer than the
    # model's limit the shift is negative.
    label_ind = [label + (sentlenlimit - length)
                 for label, length in zip(label_ind, all_sentence_lengths)]
    if not for_test:
        # Shuffling so that when Keras does validation split, it is not always at the end.
        # zip() is materialised because random.shuffle needs a mutable
        # sequence (zip is a lazy iterator on Python 3).
        sentences_and_labels = list(zip(tagged_sentences, label_ind))
        if sentences_and_labels:
            # Unpacking zip(*[]) would fail, so empty data is left as is.
            random.shuffle(sentences_and_labels)
            tagged_sentences, label_ind = zip(*sentences_and_labels)
    sys.stderr.write("Indexing %s data\n" % dataset_type)
    inputs = self.data_processor.prepare_input(tagged_sentences, onto_aware=onto_aware,
                                               sentlenlimit=sentlenlimit, for_test=for_test,
                                               remove_singletons=False)
    labels = self.data_processor.make_one_hot(label_ind)
    return inputs, labels
评论列表
文章目录