def get_parse_info(parsestr, stemmer, language, stoplist):
    """Collect POS-tagged tokens and candidate phrases from a bracketed parse.

    The parse string is read into an NLTK ``Tree``; every subtree of height 2
    (a node whose child is the token itself) contributes a ``(word, pos)``
    pair.  The pairs are chunked with a language-specific regular-expression
    grammar and the resulting chunk tree is handed to ``get_terms``.

    Args:
        parsestr: Bracketed parse string accepted by ``Tree.fromstring``.
        stemmer: Object exposing ``stem(word)``; used here to key the token
            table and also forwarded to ``get_terms``.
        language: ``'german'`` or ``'english'``; selects the chunk grammar.
        stoplist: Forwarded unchanged to ``get_terms``.

    Returns:
        A ``(hash_token_pos, phrase_list)`` tuple.  ``hash_token_pos`` is an
        ``OrderedDict`` mapping each stemmed token to ``"word::pos"``; when
        several tokens share a stem, the last one seen wins.  ``phrase_list``
        holds the non-empty terms yielded by ``get_terms``, each joined with
        single spaces.

    Raises:
        ValueError: If ``language`` is not one of the supported values.
    """
    # Select the grammar up front so an unsupported language fails fast with
    # a clear message instead of an unbound-local error further down.
    if language == 'german':
        grammar = r"""
NBAR:
{<N.*|ADJ.*>*<N.*>} # Nouns and Adjectives, terminated with Nouns
VP:
{<V.*>} # terminated with Verbs
NP:
{<NBAR>}
{<NBAR><APPR><NBAR>} # Above, connected with in/of/etc...
"""
    elif language == 'english':
        # Taken from Su Nam Kim Paper...
        grammar = r"""
NBAR:
{<NN.*|JJ>*<NN.*>} # Nouns and Adjectives, terminated with Nouns
VP:
{<V.*>} # terminated with Verbs
NP:
{<NBAR>}
{<NBAR><IN><NBAR>} # Above, connected with in/of/etc...
"""
    else:
        raise ValueError(
            "Unsupported language: %r (expected 'german' or 'english')"
            % (language,)
        )

    hash_token_pos = OrderedDict()
    chunker = RegexpParser(grammar)

    postoks = []
    for subtree in Tree.fromstring(parsestr).subtrees():
        # Only height-2 subtrees carry a token as their first child.
        if subtree.height() == 2:
            word, pos = subtree[0], subtree.label()
            # Keyed by stem, so a later token with the same stem overwrites
            # the earlier entry.
            hash_token_pos[stemmer.stem(word)] = word + u"::" + pos
            postoks.append((word, pos))

    chunk_tree = chunker.parse(postoks)
    phrases = get_terms(chunk_tree, stemmer, stoplist)
    # Drop empty terms before joining their parts into phrase strings.
    phrase_list = [' '.join(term) for term in phrases if term]
    return hash_token_pos, phrase_list
data_helpers.py 文件源码
python
阅读 44
收藏 0
点赞 0
评论 0
评论列表
文章目录