generate_stem_pos_tag.py 文件源码-python代码片段

generate_stem_pos_tag.py 文件源码

python

阅读 31 收藏 0 点赞 0 评论 0

项目：kaggle-quora-solution-8th 作者: qqgeogor 项目源码文件源码

def pos_tag_text(line,
                 token_pattern=token_pattern,
                 exclude_stopword=stopwords,
                 encode_digit=False):
    """Stem and POS-tag the ``question1`` and ``question2`` fields of a row.

    For each of the two fields the text is tokenized with ``token_pattern``,
    lower-cased and passed through ``stem_tokens``.  The space-joined stems
    are stored under ``<field>_stem``; the space-joined POS tags are stored
    under ``<field>_pos_tag``.

    Note that ``<field>_stem`` is written *before* stopword filtering, so it
    still contains stopwords, whereas the tags are computed on the filtered
    token list.

    Args:
        line: Row-like object that supports item read/write by field name
            and selection by a list of names (presumably a pandas Series
            produced by ``DataFrame.apply(axis=1)`` -- confirm with caller).
            It is modified in place: four new entries are added.
        token_pattern: Regular-expression source string used with
            ``findall`` to extract tokens.
        exclude_stopword: Used only for its truthiness.  When truthy, tokens
            found in the module-level ``stopwords`` are dropped before
            tagging; the value itself is not used as the stopword list.
        encode_digit: Accepted for interface compatibility; not used by
            this function.

    Returns:
        The selection ``question1_stem``, ``question1_pos_tag``,
        ``question2_stem``, ``question2_pos_tag`` taken from ``line``.
    """
    # Only re.UNICODE is passed: Python 3.6+ raises ValueError when
    # re.LOCALE is combined with a str pattern, so the previous
    # ``re.UNICODE | re.LOCALE`` could not compile there.
    token_regex = re.compile(token_pattern, flags=re.UNICODE)

    for name in ("question1", "question2"):
        text = line[name]

        # Tokenize and normalise case, then stem.
        tokens = [tok.lower() for tok in token_regex.findall(text)]
        tokens = stem_tokens(tokens, english_stemmer)
        line[name + '_stem'] = ' '.join(tokens)

        if exclude_stopword:
            tokens = [tok for tok in tokens if tok not in stopwords]

        # pos_tag yields (token, tag) pairs; only the tags are kept.
        tags = pos_tag(tokens)
        line[name + '_pos_tag'] = " ".join(tag for _, tag in tags)

    return line[[u'question1_stem', u'question1_pos_tag',
                 u'question2_stem', u'question2_pos_tag']]