def _avgrank_corp(inp_dir, hdv_vocab, num=5000):
    """Return the most frequent corpus tokens that are not in ``hdv_vocab``.

    Walks ``inp_dir`` recursively, reads every file whose name does not
    start with a dot as UTF-8, tokenizes it with ``word_tokenize`` and
    counts the tokens across all files. Tokens listed in ``hdv_vocab`` are
    then dropped from the counts and the most frequent remaining tokens
    are returned.

    Args:
        inp_dir: Root directory of the corpus.
        hdv_vocab: Iterable of tokens to exclude from the result.
        num: Maximum number of tokens to return (passed to
            ``Counter.most_common``).

    Returns:
        A list of at most ``num`` tokens converted with ``str()``, ordered
        from most to least frequent.
    """
    counts = Counter()

    # Count every token of every non-hidden file under inp_dir.
    for root, _dirs, files in os.walk(inp_dir):
        for fname in files:
            if fname.startswith('.'):
                # Hidden files (e.g. .DS_Store) are not corpus documents.
                continue
            filepath = os.path.join(root, fname)
            with codecs.open(filepath, 'r', encoding="utf-8") as handle:
                counts.update(word_tokenize(handle.read()))

    # Drop excluded words; pop() with a default tolerates absent words.
    for word in hdv_vocab:
        counts.pop(word, None)

    vocab = []
    for word, _freq in counts.most_common(num):
        try:
            vocab.append(str(word))
        except UnicodeError:
            # On Python 2, str() fails for unicode tokens with non-ASCII
            # characters; such tokens are skipped on a best-effort basis.
            # Only this error is caught so interrupts are not swallowed.
            continue
    return vocab
评论列表
文章目录