import jieba
import nltk


def getChList(docStrByte):
    # Decode the raw GBK bytes (ignoring undecodable characters) and lowercase.
    inputStr = str(docStrByte, encoding='gbk', errors='ignore').lower()
    # Merge all lines into a single string before segmenting.
    strList = ''.join(inputStr.split('\n'))
    # Segment the text with jieba; each token is a (word, start, end) tuple.
    rawTokens = list(jieba.tokenize(strList))
    # Build the stop-word lookup: each stop word becomes a dict key, value None.
    with open('stopwords.txt', 'r', encoding='utf-8', errors='ignore') as fSW:
        stopWord = {}.fromkeys(fSW.read().split('\n'))
    stopWord[''] = None
    final = []
    s = nltk.stem.SnowballStemmer('english')
    for seg in rawTokens:
        rawWord = seg[0].strip()  # strip() removes surrounding whitespace
        if rawWord.isalpha():  # stem purely alphabetic (English) tokens
            word = s.stem(rawWord)
        else:
            word = rawWord
        if word not in stopWord:  # filter out stop words
            final.append(word)  # collect the kept tokens in a list
    return final
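
A minimal usage sketch (assuming stopwords.txt sits in the working directory; the input file name sample_doc.txt is hypothetical):

# Read a GBK-encoded document as raw bytes and pass it to getChList.
with open('sample_doc.txt', 'rb') as f:
    tokens = getChList(f.read())
print(tokens[:20])  # first 20 cleaned, stemmed tokens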
categorizing.py source code