def load_utf8_data_and_labels(positive_data_file, negative_data_file):
    """Load two UTF-8 text files, segment each line, and build one-hot labels.

    Every line of each file is tokenized with ``jieba.cut`` and the tokens are
    re-joined with single spaces. The positive sentences come first, followed
    by the negative ones, and each sentence is then passed through
    ``clean_str`` (defined elsewhere in this module).

    Args:
        positive_data_file: Path of the file holding one positive example
            per line.
        negative_data_file: Path of the file holding one negative example
            per line.

    Returns:
        A two-element list ``[x_text, y]``. ``x_text`` is the list of cleaned,
        space-separated sentences. ``y`` is an integer array with one row per
        sentence: ``[0, 1]`` for a positive example and ``[1, 0]`` for a
        negative one, in the same order as ``x_text``. When both files are
        empty, ``y`` has shape ``(0, 2)``.
    """

    def _read_segmented(path):
        # The context manager closes the file as soon as its lines are read,
        # so no handle is left open for the garbage collector to clean up.
        with codecs.open(path, "r", encoding='utf-8') as handle:
            lines = handle.readlines()
        return [" ".join(jieba.cut(line)) for line in lines]

    # Load data from files and split each line into words.
    positive_examples = _read_segmented(positive_data_file)
    negative_examples = _read_segmented(negative_data_file)

    x_text = [clean_str(sent) for sent in positive_examples + negative_examples]

    # Generate labels.
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    # Build the matrix from the combined list and force two columns:
    # concatenating an empty label list with a non-empty one would fail
    # because the empty side becomes a 1-D array.
    y = np.array(positive_labels + negative_labels, dtype=int).reshape(-1, 2)
    return [x_text, y]
评论列表
文章目录