def load_data_labels(data_file, labels_file, max_document_length=1000):
    """
    Load documents and their labels from two line-aligned files and encode them.

    Every line of ``data_file`` is treated as one document: it is stripped of
    surrounding whitespace and passed through ``clean_str``. Every line of
    ``labels_file`` is treated as a comma-separated record whose second field
    (index 1) is the label.

    Args:
        data_file: Path to the text file, decoded as latin-1, one document
            per line.
        labels_file: Path to the labels file, opened with the platform
            default encoding, one comma-separated record per line.
        max_document_length: Value handed to ``VocabularyProcessor`` as its
            maximum document length. Defaults to 1000, the value that used to
            be hard-coded here.

    Returns:
        A tuple ``(x, y, vocab_processor)`` where ``x`` is a NumPy array built
        from ``vocab_processor.fit_transform`` over the cleaned documents,
        ``y`` is the output of ``LabelBinarizer.fit_transform`` over the
        labels, and ``vocab_processor`` is the fitted ``VocabularyProcessor``.

    Raises:
        IndexError: If a line of ``labels_file`` contains no comma, so there
            is no second field to read the label from.
    """
    with open(data_file, 'r', encoding='latin-1') as f:
        # Strip first, then clean, matching the original processing order.
        data = [clean_str(line.strip()) for line in f]

    with open(labels_file, 'r') as f:
        # The label is the second comma-separated field of each record.
        labels = [line.strip().split(',')[1].strip() for line in f]

    lb = LabelBinarizer()
    y = lb.fit_transform(labels)

    # The length is supplied by the caller rather than derived from the
    # longest document in the data.
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(data)))
    return x, y, vocab_processor
评论列表
文章目录