def extract_NNs(chunk, pos):
    """Return the filtered noun-phrase strings found in *chunk*.

    Args:
        chunk: iterable of ``(phrase, phrase_type)`` pairs, e.g.
            ``[('the lady', 'NP'), ('with', 'PP'), ('the blue shirt', 'NP')]``.
        pos: iterable of ``(word, pos_tag)`` pairs, e.g. ``[('man', 'NN')]``.

    Returns:
        A list with one string per ``'NP'`` phrase that still has words left
        after filtering; the surviving words are joined by single spaces.
        A word is dropped when it appears in the module-level ``stop_words``,
        ``location_words``, ``color_words`` or ``size_words`` collections, or
        when its tag is ``'JJ'`` or ``'CD'``.

    Words that have no entry in *pos* are treated as having no tag, so they
    are kept unless they are forbidden words (previously this raised
    ``IndexError``).
    """
    # Build the forbidden-word set once so each membership test is O(1).
    forbidden = frozenset().union(
        stop_words, location_words, color_words, size_words
    )

    # Map each word to the tag of its FIRST occurrence in `pos`, which is the
    # entry the original linear scan selected.
    first_tag = {}
    for entry in pos:
        first_tag.setdefault(entry[0], entry[1])

    skipped_tags = ('JJ', 'CD')  # adjectives and cardinal numbers are not wanted
    noun_phrases = []
    for phrase, ptype in chunk:
        if ptype != 'NP':
            continue
        kept = [
            wd for wd in phrase.split()
            if wd not in forbidden and first_tag.get(wd) not in skipped_tags
        ]
        # Skip phrases in which every word was filtered out.
        if kept:
            noun_phrases.append(' '.join(kept))
    return noun_phrases
# Comment list
# Table of contents