import re

from nltk import pos_tag

# token_pattern, stopwords, english_stemmer and stem_tokens are module-level
# names defined earlier in generate_stem_pos_tag.py (a tokenizer regex, the
# stopword list, an English stemmer and a helper that stems a list of tokens).

def pos_tag_text(line,
                 token_pattern=token_pattern,
                 exclude_stopword=True,
                 encode_digit=False):
    """Add stemmed-text and POS-tag columns for both questions of one row."""
    token_pattern = re.compile(token_pattern, flags=re.UNICODE)
    for name in ["question1", "question2"]:
        l = line[name]
        ## tokenize
        tokens = [x.lower() for x in token_pattern.findall(l)]
        ## stem
        tokens = stem_tokens(tokens, english_stemmer)
        line[name + '_stem'] = ' '.join(tokens)
        ## optionally drop stopwords before tagging
        if exclude_stopword:
            tokens = [x for x in tokens if x not in stopwords]
        ## part-of-speech tag the tokens and keep only the tag strings
        tags = pos_tag(tokens)
        tags_str = " ".join(t for w, t in tags)
        line[name + '_pos_tag'] = tags_str
    return line[[u'question1_stem', u'question1_pos_tag',
                 u'question2_stem', u'question2_pos_tag']]
Source file: generate_stem_pos_tag.py
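For context, a minimal driver sketch showing how this row-level helper can be applied with pandas. The sample dataframe and the token_pattern, stopwords, english_stemmer and stem_tokens definitions below are assumptions standing in for the ones defined elsewhere in generate_stem_pos_tag.py, not the original values; they must exist before pos_tag_text is defined, since its default arguments reference them at definition time.

# Hypothetical driver for pos_tag_text; helper definitions are assumptions.
import pandas as pd
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem.snowball import SnowballStemmer

token_pattern = r"(?u)\b\w\w+\b"                  # assumed tokenizer regex
english_stemmer = SnowballStemmer("english")      # assumed stemmer
stopwords = set(nltk_stopwords.words("english"))  # assumed stopword list

def stem_tokens(tokens, stemmer):
    # assumed helper: apply the stemmer to every token
    return [stemmer.stem(token) for token in tokens]

# ... pos_tag_text defined as in the listing above ...

# first run may require nltk.download("averaged_perceptron_tagger")
# and nltk.download("stopwords")
df = pd.DataFrame({
    "question1": ["How do I learn Python quickly?"],
    "question2": ["What is the fastest way to learn Python?"],
})
features = df.apply(pos_tag_text, axis=1)
print(features.iloc[0]["question1_pos_tag"])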