def doc_to_ids(self, doc, training=True):
    """Convert a document into per-sentence arrays of term ids.

    The document is split into sentences with ``sent_tokenize`` and each
    sentence into whitespace-separated terms. Every term is mapped through
    ``self.term_to_id``; terms for which that call returns ``None`` are
    dropped.

    Side effect: ``self.docfreq[term_id]`` is incremented once for each
    distinct term id encountered during this call, i.e. once per document
    rather than once per occurrence.

    Args:
        doc: The document, passed as-is to ``sent_tokenize``.
        training: Forwarded unchanged to ``self.term_to_id``.

    Returns:
        A list with one ``np.int32`` array per sentence, holding the ids of
        that sentence's kept terms in order of appearance. A sentence with
        no kept terms contributes an empty array.
    """
    sentence_arrays = []
    # Ids already counted for this document, so docfreq is bumped only once
    # per id no matter how many times or in how many sentences it occurs.
    seen_ids = set()
    for sentence in sent_tokenize(doc):
        sentence_ids = []
        for term in sentence.split():
            term_id = self.term_to_id(term, training)
            if term_id is None:
                continue
            sentence_ids.append(term_id)
            if term_id not in seen_ids:
                seen_ids.add(term_id)
                # Counts in how many documents a word appears. Per the
                # original note, words appearing in only a few documents
                # are later removed from the vocabulary via cut_low_freq()
                # (defined elsewhere; not visible here).
                self.docfreq[term_id] += 1
        sentence_arrays.append(np.array(sentence_ids, dtype=np.int32))
    return sentence_arrays
vocabulary_sentenceLayer.py 文件源码
python
阅读 20
收藏 0
点赞 0
评论 0
评论列表
文章目录