def load_data_from_json2(path_to_json, test_split, vocabulary_size):
    '''
    Load sentence-compression data from a json file and split it into
    train and test portions.

    :param path_to_json: path to the json corpus file (read via read_json_file)
    :param test_split: fraction in [0, 1]; the FIRST int(len*test_split)
                       items become the test set, the remainder the train set
    :param vocabulary_size: vocabulary cap passed to word2index when building
                            the word -> index mapping
    :return: ((X_train, y_train, len_train, weight_train),
              (X_test,  y_test,  len_test,  weight_test),
              (original_sentence_array, compression_sentence_array))
             The last pair holds the raw text of ALL sentences (not split).
    '''
    X = []
    y = []
    len_sent_array = []
    sample_weight = []
    objects = read_json_file(path_to_json)
    # parenthesized single-argument form: valid on both Python 2 and 3
    print('Data %d sentences' % len(objects))
    original_sentence_array = []
    compression_sentence_array = []
    word2index_dict, _ = word2index(objects, vocabulary_size)
    # 'obj' instead of 'object': do not shadow the builtin
    for i, obj in enumerate(objects, start=1):
        original_sentence, compression_sentence = get_originalSent_compressionSent(obj)
        (array_sent, sample_w) = word2vec(original_sentence, word2index_dict)
        X.append(array_sent)
        sample_weight.append(sample_w)
        (y_l, l) = label_compress(original_sentence, compression_sentence)
        y.append(y_l)
        len_sent_array.append(l)
        if i % 100 == 0:
            # progress indicator for long corpora; flush so dots appear live
            sys.stdout.write('.')
            sys.stdout.flush()
        # keep the raw text alongside the encoded arrays
        original_sentence_array.append(original_sentence)
        compression_sentence_array.append(compression_sentence)
    # All four parallel lists have the same length, so one cut index suffices
    # (the original recomputed int(len(...)*test_split) eight times).
    split = int(len(X) * test_split)
    train = (X[split:], y[split:], len_sent_array[split:], sample_weight[split:])
    test = (X[:split], y[:split], len_sent_array[:split], sample_weight[:split])
    return (train, test, (original_sentence_array, compression_sentence_array))
# NOTE(review): stray page-scrape residue removed from executable scope —
# original lines read "评论列表" (comment list) / "文章目录" (article TOC),
# leftover blog-page chrome that is a syntax error in a Python file.