import nltk
import numpy as np

# The helpers clean_str, pad_sentences, build_vocab, build_input_data and
# load_data_and_labels are assumed to come from the accompanying
# data_helpers module of the CNN text-classification code.

def load_data_for_books(path):
    # Read the whole book as UTF-8 text.
    text = open(path, encoding='utf-8').read()
    # Split the text into sentences with NLTK's pretrained Punkt tokenizer
    # (requires a prior nltk.download('punkt')).
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    book = tokenizer.tokenize(text)
    # Alternative splitting strategies:
    # book = re.split(r' *[\.\?!][\'"\)\]]* *', text)
    # book = list(open(path, "r").readlines())
    book = [clean_str(s.strip()) for s in book]
    # Tokenize each cleaned sentence into a list of words.
    x_text = [s.split(" ") for s in book]
    # Assign every book sentence the same placeholder one-hot label [0, 1].
    y = np.vstack([np.zeros(len(x_text)), np.ones(len(x_text))]).T
    sentences_padded = pad_sentences(x_text)
    # Build the vocabulary from the original training data so that word
    # indices match the ones the model was trained on.
    sentencesT, labelsT = load_data_and_labels()
    sentences_paddedT = pad_sentences(sentencesT)
    vocabulary, vocabulary_inv = build_vocab(sentences_paddedT)
    # Caveat: book words missing from the training vocabulary will raise a
    # KeyError here, and the two corpora may be padded to different lengths.
    x, y = build_input_data(sentences_padded, y, vocabulary)
    return [x, y, vocabulary, vocabulary_inv, sentencesT]
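
# A minimal usage sketch (assumptions: the data_helpers module is importable
# from the same directory, and 'book.txt' is a hypothetical input file):
if __name__ == '__main__':
    x, y, vocabulary, vocabulary_inv, sentencesT = load_data_for_books('book.txt')
    print('book tensor shape:', x.shape)         # (num_sentences, max_sentence_len)
    print('vocabulary size:', len(vocabulary))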