def doc_to_ids(self, doc, training=True, window=150):
    """Convert a tokenized document into a list of term-id arrays.

    Parameters
    ----------
    doc : iterable of sentences, each an iterable of term strings.
    training : bool
        Passed through to ``self.term_to_id`` — presumably controls
        whether unseen terms are added to the vocabulary (verify there).
    window : int
        Maximum length of each emitted array; a sentence longer than
        this is split into consecutive chunks of at most ``window`` ids.
        Defaults to 150, the value previously hard-coded.

    Returns
    -------
    list of 1-D ``np.ndarray`` of term ids; empty sentences (or
    sentences whose every term maps to ``None``) produce no array.

    Side effects
    ------------
    Increments ``self.docfreq[term_id]`` exactly once per call for each
    distinct term id seen, i.e. accumulates document frequency.
    """
    batches = []
    seen = set()  # term ids already counted toward docfreq for this doc
    for sentence in doc:
        ids = []
        for term in sentence:
            term_id = self.term_to_id(term, training)
            if term_id is None:  # unknown / filtered-out term
                continue
            ids.append(term_id)
            if term_id not in seen:
                seen.add(term_id)
                self.docfreq[term_id] += 1
        if not ids:
            continue
        # Emit the sentence as one or more chunks of at most `window` ids;
        # a short sentence yields a single array, same as before.
        batches.extend(np.array(ids[i:i + window])
                       for i in range(0, len(ids), window))
    return batches