import jieba
import nltk


def getChList(docStrByte):
    # Decode the raw GBK bytes (ignoring undecodable characters) and lowercase.
    inputStr = str(docStrByte, encoding='gbk', errors='ignore').lower()
    # Merge all lines into a single string before segmenting.
    strList = ''.join(inputStr.split('\n'))
    # Segment the text with jieba; each token is a (word, start, end) tuple.
    rawTokens = list(jieba.tokenize(strList))
    # Build the stop-word lookup: each stop word becomes a dict key, value None.
    with open('stopwords.txt', 'r', encoding='utf-8', errors='ignore') as fSW:
        stopWord = {}.fromkeys(fSW.read().split('\n'))
    stopWord[''] = None
    final = []
    s = nltk.stem.SnowballStemmer('english')
    for seg in rawTokens:
        rawWord = seg[0].strip()  # strip() removes surrounding whitespace
        if rawWord.isalpha():  # stem purely alphabetic (English) tokens
            word = s.stem(rawWord)
        else:
            word = rawWord
        if word not in stopWord:  # filter out stop words
            final.append(word)  # collect the kept tokens in a list
    return final
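
A minimal usage sketch (assuming stopwords.txt sits in the working directory; the input file name sample_doc.txt is hypothetical):

# Read a GBK-encoded document as raw bytes and pass it to getChList.
with open('sample_doc.txt', 'rb') as f:
    tokens = getChList(f.read())
print(tokens[:20])  # first 20 cleaned, stemmed tokens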
categorizing.py source code