# Part-of-speech flags a token must carry to be kept. The values are the same
# ones the original inline list contained; presumably they follow the tag set
# emitted by jieba.posseg (noun, time, locality, verb, adjective, numeral,
# onomatopoeia and "x" families) -- confirm against the tagger actually bound
# to ``pseg``.
_KEPT_POS_FLAGS = frozenset([
    u'n', u'nr', u'ns', u'nt', u'nz', u'ng',
    u't', u'tg',
    u'f',
    u'v', u'vd', u'vn', u'vf', u'vx', u'vi', u'vl', u'vg',
    u'a', u'an', u'ag', u'al',
    u'm', u'mq',
    u'o',
    u'x',
])


def delNOTNeedWords(content, customstopwords=None):
    """Segment *content* and keep only the tokens worth indexing.

    The text is tokenised with ``pseg.lcut``; a token survives when it is not
    a stop word, is non-empty after stripping spaces, and its part-of-speech
    flag is one of ``_KEPT_POS_FLAGS``.

    Args:
        content: Text to segment; passed unchanged to ``pseg.lcut``.
        customstopwords: Either the path of a UTF-8 stop-word file with one
            word per line (defaults to ``"stopwords.txt"``), or an iterable
            of stop words. A path that does not exist yields an empty
            stop-word set.

    Returns:
        A ``(result, return_words)`` tuple. ``return_words`` is the list of
        kept tokens, each encoded as a UTF-8 byte string; ``result`` is those
        byte strings concatenated without a separator.
    """
    # Local import kept from the original: the file-level imports are not
    # guaranteed to provide ``os``.
    import os

    if customstopwords is None:
        customstopwords = "stopwords.txt"

    # ``(type(u''), bytes)`` matches text and byte paths on Python 2 and 3.
    if isinstance(customstopwords, (type(u''), bytes)):
        # A missing file must mean "no stop words". The original fell through
        # with the path string itself, turning the membership test below into
        # a substring match against the file name.
        stop_words = set()
        if os.path.exists(customstopwords):
            with codecs.open(customstopwords, encoding='UTF-8') as handle:
                # codecs.open does no newline translation, so drop the '\r'
                # left behind by CRLF files or those entries never match.
                stop_words = set(
                    line.rstrip(u'\r')
                    for line in handle.read().split(u'\n')
                )
    else:
        # Caller supplied the stop words directly.
        stop_words = set(customstopwords)

    return_words = []
    for word, flag in pseg.lcut(content):
        if word in stop_words or flag not in _KEPT_POS_FLAGS:
            continue
        # Strip with a bytes argument so this also runs on Python 3, where
        # bytes.strip() rejects a text argument.
        tempword = word.encode('utf-8').strip(b' ')
        if tempword:
            return_words.append(tempword)

    return b''.join(return_words), return_words
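# Web-page residue left over from the original post (not part of the code):
# "评论列表" (comment list), "文章目录" (table of contents).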