def getFreqWords(directoryPath):
    """Print the words found in more than 5% of the files under a directory.

    Each file is tokenized with ``word_tokenize``, tokens are lower-cased,
    and tokens present in the module-level ``stop`` collection are dropped.
    A word is counted at most once per file, so the figure printed next to
    each word is the fraction of processed files that contain it.  Output
    lines have the form ``<word> <fraction>`` and are ordered from the most
    to the least widespread word.

    When ``MAX_FILES_PER_CLASS`` is positive and smaller than the number of
    files found, a random sample of that many files is processed instead of
    the full list.

    Args:
        directoryPath: Directory passed to ``getListOfFilesInDir`` together
            with the pattern ``"*"`` (presumably yields the paths of all
            files in that directory -- confirm against the helper).

    Returns:
        None. Results are written to standard output.

    Raises:
        UnicodeDecodeError: If a file's contents are not valid UTF-8.
    """
    files = getListOfFilesInDir(directoryPath, "*")
    if 0 < MAX_FILES_PER_CLASS < len(files):
        files = random.sample(files, MAX_FILES_PER_CLASS)

    # Number of files in which each word occurs.
    doc_freq = Counter()
    for path in files:
        # Read bytes and decode explicitly so the text is unicode on both
        # Python 2 and Python 3 (a text-mode read has no .decode() on 3).
        with open(path, "rb") as f:
            text = f.read().decode("utf-8")
        lowered = (token.lower() for token in word_tokenize(text))
        # A set keeps each word once per file, whatever its in-file count.
        doc_freq.update({word for word in lowered if word not in stop})

    # float() keeps the divisions below true divisions on Python 2 as well.
    num_files = float(len(files))
    threshold = 0.05 * num_files
    for word, n_docs in doc_freq.most_common():
        if n_docs <= threshold:
            # Entries are sorted by descending count: nothing further on
            # can exceed the threshold.
            break
        # A single formatted string prints identically under the Python 2
        # print statement and the Python 3 print function.
        print("%s %s" % (word, n_docs / num_files))
utility_getFreqWords.py 文件源码
python
阅读 28
收藏 0
点赞 0
评论 0
评论列表
文章目录