preprocess2.py 文件源码-python代码片段

def load_data_from_json2(path_to_json, test_split, vocabulary_size):
    '''
    Load sentence-compression data from a json file and split it into a
    train portion and a test portion.

    The split is positional (no shuffling): the first
    ``int(n * test_split)`` samples, in the order returned by
    ``read_json_file``, form the test portion and the remaining samples
    form the train portion.

    :param path_to_json: path to json file, passed to ``read_json_file``
    :param test_split: multiplier applied to the number of samples to get
        the split index (expected to be a fraction between 0 and 1)
    :param vocabulary_size: forwarded to ``word2index`` when building the
        word-to-index dictionary
    :return: a 3-tuple ``(train, test, texts)``; ``train`` and ``test`` are
        each ``(X, y, len_sent_array, sample_weight)`` and ``texts`` is
        ``(original_sentence_array, compression_sentence_array)`` covering
        ALL samples (it is not split)
    '''
    objests = read_json_file(path_to_json)
    # Parenthesised single-argument form: prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('Data %d sentences' % len(objests))
    word2indext_dict, _ = word2index(objests, vocabulary_size)

    X = []
    y = []
    len_sent_array = []
    sample_weight = []
    original_sentence_array = []
    compression_sentence_array = []

    for count, entry in enumerate(objests, 1):
        original_sentence, compression_sentence = \
            get_originalSent_compressionSent(entry)

        # Encode the original sentence with the word-to-index dictionary;
        # the helper also returns the per-sample weight stored alongside it.
        array_sent, sample_w = word2vec(original_sentence, word2indext_dict)
        X.append(array_sent)
        sample_weight.append(sample_w)

        # Second element is presumably the sentence length -- confirm
        # against label_compress.
        y_l, l = label_compress(original_sentence, compression_sentence)
        y.append(y_l)
        len_sent_array.append(l)

        # Progress indicator: one dot per 100 processed sentences.
        if count % 100 == 0:
            sys.stdout.write('.')

        # Keep the raw text of every sample for the third return element.
        original_sentence_array.append(original_sentence)
        compression_sentence_array.append(compression_sentence)

    # Every list receives exactly one item per sample, so a single split
    # index is valid for all four of them.
    split = int(len(X) * test_split)
    train = (X[split:], y[split:],
             len_sent_array[split:], sample_weight[split:])
    test = (X[:split], y[:split],
            len_sent_array[:split], sample_weight[:split])
    return (train, test,
            (original_sentence_array, compression_sentence_array))