def LemNormalizeIt(text):
    """Normalize Italian text into a list of lemmatized tokens.

    Pipeline: strip non-ASCII characters, lower-case, replace every
    punctuation mark and digit with a space, tokenize with NLTK, then
    lemmatize each token via the morph-it dictionary, keeping only
    tokens longer than 3 characters.

    Args:
        text: raw input string.

    Returns:
        list[str]: lemmatized tokens (surface form is kept when a word
        is not found in the morph-it dictionary).
    """
    # Replace non-ASCII characters; 'replace' yields '?', which the
    # punctuation translation table below turns into a space anyway.
    text = text.encode('ascii', 'replace').decode()
    # Map punctuation and digits to spaces in one C-level translate pass.
    remove_punct_and_digits = {ord(ch): ' ' for ch in string.punctuation + string.digits}
    transformed = text.lower().translate(remove_punct_and_digits)
    # Tokenize the cleaned, lower-cased string.
    tokenized = nltk.word_tokenize(transformed)
    # NOTE(review): the morph-it dictionary is reloaded on every call —
    # consider caching it at module level if this function is hot.
    morph_it = load_morph_it()
    # Lemmatize via dict lookup (fallback: the word itself) and drop
    # short tokens (len <= 3).
    return [morph_it.get(w, w) for w in tokenized if len(w) > 3]