# load_data.py - Python source code snippet

def get_train_data(language, max_examples=100):
    """Load polarity sentences for *language* and split them into train/dev sets.

    Reads ``./data/<language>/rt-polarity.pos`` and
    ``./data/<language>/rt-polarity.neg``, keeps at most ``max_examples``
    lines from each file, labels them, converts them with a
    ``VocabularyProcessor`` and holds out 10% of the examples as a dev set.

    Args:
        language: Name of the sub-directory of ``./data/`` holding the files.
        max_examples: Maximum number of lines taken from each file. Defaults
            to 100 (the previously hard-coded limit); ``None`` keeps every
            line.

    Returns:
        A list ``[x_train, x_dev, y_train, y_dev, vocab_processor]``. The
        ``y_*`` arrays hold two-column labels: ``[0, 1]`` for positive and
        ``[1, 0]`` for negative sentences. The ``x_*`` arrays hold the rows
        produced by ``vocab_processor.fit_transform``.

    Raises:
        ValueError: If both data files yield no examples.

    Note:
        Calling this function re-seeds NumPy's global RNG with 1234. The dev
        indices are drawn with the ``random`` module, which is not seeded
        here, so the train/dev membership may differ between runs.
    """
    # Load data from files
    path = "./data/" + language + "/"
    positive_examples = _read_examples(path + "rt-polarity.pos", max_examples)
    negative_examples = _read_examples(path + "rt-polarity.neg", max_examples)

    x_text = positive_examples + negative_examples
    if not x_text:
        # Fail with a clear message instead of the opaque error max() gives.
        raise ValueError("no examples found under " + path)

    # Generate labels. Building one list first keeps the result two-column
    # even when one of the classes is empty.
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.array(positive_labels + negative_labels)

    # Build vocabulary; the document length is the longest sentence measured
    # in jieba tokens.
    max_length_of_sentence = max(len(jieba.lcut(sent)) for sent in x_text)
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_length_of_sentence)
    x = np.array(list(vocab_processor.fit_transform(x_text)))

    # Randomly shuffle data
    num_examples = len(y)
    np.random.seed(1234)
    shuffle_indices = np.random.permutation(np.arange(num_examples))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/cross-validation set.
    # random.sample needs a real sequence: Python 3 rejects a NumPy array.
    # dtype=int keeps an empty selection usable as an index array.
    dev_size = int(num_examples * 0.1)
    cross_validation_indices = np.array(
        random.sample(range(num_examples), dev_size), dtype=int
    )
    held_out = set(cross_validation_indices.tolist())
    train_indices = np.array(
        [i for i in range(num_examples) if i not in held_out], dtype=int
    )

    x_train, x_dev = x_shuffled[train_indices], x_shuffled[cross_validation_indices]
    y_train, y_dev = y_shuffled[train_indices], y_shuffled[cross_validation_indices]

    return [x_train, x_dev, y_train, y_dev, vocab_processor]


def _read_examples(file_path, max_examples):
    """Return up to *max_examples* whitespace-stripped lines of *file_path*."""
    # NOTE(review): the file is decoded with the platform default encoding -
    # confirm that matches how the data files were written.
    with open(file_path, "r") as handle:  # closes the handle on exit
        lines = handle.readlines()
    return [line.strip() for line in lines[:max_examples]]