word2vec_wiki.py 文件源码-python代码片段

word2vec_wiki.py 文件源码

python

阅读 21 收藏 0 点赞 0 评论 0

项目：Book_DeepLearning_Practice 作者: wac81 项目源码文件源码

def delNOTNeedWords(content,customstopwords=None):
    # words = jieba.lcut(content)
    if customstopwords == None:
        customstopwords = "stopwords.txt"
    import os
    if os.path.exists(customstopwords):
        stop_words = codecs.open(customstopwords, encoding='UTF-8').read().split(u'\n')
        customstopwords = stop_words

    result=''
    return_words = []
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # +"/"+str(w.flag)+" "  #????
    words = pseg.lcut(content)

    for word, flag in words:
        # print word.encode('utf-8')
        tempword = word.encode('utf-8').strip(' ')
        if (word not in customstopwords and len(tempword)>0 and flag in [u'n',u'nr',u'ns',u'nt',u'nz',u'ng',u't',u'tg',u'f',u'v',u'vd',u'vn',u'vf',u'vx',u'vi',u'vl',u'vg', u'a',u'an',u'ag',u'al',u'm',u'mq',u'o',u'x']):
            # and flag[0] in [u'n', u'f', u'a', u'z']):
            # ["/x","/zg","/uj","/ul","/e","/d","/uz","/y"]): #??????????????????
            result += tempword # +"/"+str(w.flag)+" "  #????
            return_words.append(tempword)
    return result,return_words