def generate_candidate(texts, method='word', remove_punctuation=False):
    """
    Generate keyword candidates (single words or phrases) from a text string.

    The text is split into sentences, each sentence is tokenized and
    lowercased, and the tokens are POS-tagged. Candidates are then selected
    from the tagged tokens according to ``method``.

    Parameters
    ----------
    texts: str, input text string
    method: str, method to extract candidates, either 'word' or 'phrase'
        'word'   - keep every adjective/noun token that is not a stop word
        'phrase' - keep every chunk matched by the noun-phrase grammar
                   that is not itself a stop word
    remove_punctuation: bool, if True every match of ``punct_re`` in a
        sentence is replaced by a space before tokenizing

    Returns
    -------
    candidates: list, list of candidate words or phrases (lowercased, in
        order of appearance, duplicates kept). Empty when ``method`` is not
        recognised; in that case a hint is printed instead of raising.
    """
    # Tokenize the text into a list of sentences, each a list of lowercased words.
    tokenized_sentences = []
    for sentence in sent_tokenize(texts):
        if remove_punctuation:
            sentence = punct_re.sub(' ', sentence)  # remove punctuation
        tokenized_sentences.append(
            [token.lower() for token in word_tokenize(sentence)]
        )
    tagged_sentences = pos_tag_sents(tokenized_sentences)  # POS tagging

    candidates = []
    if method == 'word':
        # Adjective and noun tags (presumably Penn Treebank tag set).
        tags = {'JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'}
        for word, tag in chain.from_iterable(tagged_sentences):
            # Tokens were lowercased above, so they can be compared directly.
            if tag in tags and word not in stop_words:
                candidates.append(word)
    elif method == 'phrase':
        # Chunk "KT": optional (adjectives, nouns, preposition) prefix
        # followed by adjectives and one or more nouns.
        grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
        chunker = RegexpParser(grammar)
        # Group chunk tags one sentence at a time. Chaining all sentences
        # into a single stream before grouping would glue a chunk that ends
        # one sentence to a chunk that starts the next one, producing a
        # phrase that spans the sentence boundary.
        for tagged_sentence in tagged_sentences:
            iob_tags = tree2conlltags(chunker.parse(tagged_sentence))
            for in_chunk, group in groupby(iob_tags, lambda tag: tag[2] != 'O'):
                candidate = ' '.join(word for word, pos, chunk in group)
                if in_chunk and candidate not in stop_words:
                    candidates.append(candidate)
    else:
        print("Use either 'word' or 'phrase' in method")
    return candidates
keyphrase_extraction.py 文件源码
python
阅读 22
收藏 0
点赞 0
评论 0
评论列表
文章目录