def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single lowercased phrase
    # (Python 3 removed tuple-unpacking lambdas, so index into the (word, pos, chunk) triple)
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda wpc: wpc[2] != 'O') if key]
    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]
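A quick usage sketch (the sample text and expected output are illustrative, not from the original project). The snippet needs the NLTK 'punkt', 'averaged_perceptron_tagger', and 'stopwords' data packages, which can be fetched once with nltk.download:

import nltk
for pkg in ('punkt', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(pkg, quiet=True)

text = "Efficient keyword extraction finds noun phrases in raw text."
print(extract_candidate_chunks(text))
# something like ['efficient keyword extraction', 'noun phrases in raw text']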
Python pos_tag_sents() usage examples (source code)
Source file: AKE.py — project: NLP-Keyword-Extraction-Ensemble-Method, author: Ashwin-Ravi
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    '''Extract candidate phrases matching a specific POS sequence, rather than just noun phrases.'''
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    # (unlike the variant above, this one keeps the original casing)
    candidates = [' '.join(word for word, pos, chunk in group)
                  for key, group in itertools.groupby(all_chunks, lambda wpc: wpc[2] != 'O') if key]
    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]
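To see what the KT grammar actually matches, here is a minimal sketch on a hand-tagged sentence (the tags are supplied by hand, so no tagger model is needed):

import nltk

chunker = nltk.RegexpParser(r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}')
# "weapons of mass destruction" is NNS IN NN NN: a noun run, a preposition,
# then another noun run -- exactly the optional-prepositional pattern above.
tagged = [('weapons', 'NNS'), ('of', 'IN'), ('mass', 'NN'), ('destruction', 'NN')]
print(chunker.parse(tagged))
# (S (KT weapons/NNS of/IN mass/NN destruction/NN))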
def extract_chunks(text_string, max_words=3, lemmatize=False):
    import itertools, nltk, string

    # Any number of adjectives followed by one or more nouns, optionally preceded
    # by another adjective/noun run plus a preposition (e.g. "rate of return")
    grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
    # Make chunks using the grammar regex
    chunker = nltk.RegexpParser(grammar)
    # POS-tag each tokenized sentence
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text_string))
    # Chunk the tagged sentences with the grammar; output is in IOB format
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # Join consecutive in-chunk tokens into phrases based on the IOB tags
    candidates = [' '.join(w[0] for w in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda l: l[2] != 'O') if key]
    # Filter by maximum keyphrase length
    candidates = [c for c in candidates if len(c.split()) <= max_words]
    # Filter phrases consisting of punctuation or stopwords
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    candidates = [c for c in candidates if c not in stop_words and not all(ch in punct for ch in c)]
    # Lemmatize each candidate (note: WordNetLemmatizer treats the whole phrase as a
    # single token, so multi-word phrases generally pass through unchanged)
    if lemmatize:
        lemmatizer = nltk.stem.WordNetLemmatizer().lemmatize
        candidates = [lemmatizer(x) for x in candidates]
    return candidates
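A usage sketch (the sample text and expected output are illustrative; lemmatize=True additionally needs the NLTK 'wordnet' data package):

text = "The quick brown fox jumps over the lazy dog near the old stone bridge."
print(extract_chunks(text, max_words=3))
# something like ['quick brown fox', 'lazy dog', 'old stone bridge']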
def tag_sentences(sentences, pos_symbol=False):
    # Relies on module-level names not shown in this excerpt: tokenizer (sentence -> tokens),
    # tagger (token lists -> tagged lists), POS_TAGS (tag symbol -> tag name), and the
    # tuple indices _IDX_WORD and _IDX_SYMBOL.
    tokenized = []
    for sent in sentences:
        tokenized.append(tokenizer(sent))
    processed_list = tagger(tokenized)
    if not pos_symbol:
        # map each raw tag symbol to its human-readable name
        output_list = []
        for sentence in processed_list:
            new_sentence = []
            for word in sentence:
                new_sentence.append((word[_IDX_WORD], POS_TAGS[word[_IDX_SYMBOL]]))
            output_list.append(new_sentence)
    else:
        output_list = processed_list
    return output_list
def tag_many(self, documents, tagset=None, **kwargs):
    """POS-tag many documents in one batched pos_tag_sents call; **kwargs is accepted but unused."""
    return pos_tag_sents((word_tokenize(d) for d in documents), tagset)
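tag_many is a method cut from its class; a standalone sketch of the same batched call (the sample documents are illustrative, and the NLTK tagger data is assumed installed):

from nltk import pos_tag_sents, word_tokenize

docs = ["NLTK batches tagging for speed.", "Each document becomes one token list."]
print(pos_tag_sents(word_tokenize(d) for d in docs))
# [[('NLTK', 'NNP'), ('batches', ...), ...], [('Each', ...), ...]]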
Source file: AKE.py — project: NLP-Keyword-Extraction-Ensemble-Method, author: Ashwin-Ravi
def extract_candidate_words(text, good_tags={'JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'}):
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize and POS-tag words, then flatten the tagged sentences into one stream
    tagged_words = itertools.chain.from_iterable(nltk.pos_tag_sents(nltk.word_tokenize(sent)
                                                                    for sent in nltk.sent_tokenize(text)))
    # keep only adjectives and nouns, lowercased
    candidates = [word.lower() for word, tag in tagged_words
                  if tag in good_tags and word.lower() not in stop_words
                  and not all(char in punct for char in word)]
    return candidates
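A usage sketch (illustrative input and output; the exact words returned depend on the tagger):

text = "Deep neural networks achieve impressive results on noisy data."
print(extract_candidate_words(text))
# something like ['deep', 'neural', 'networks', 'impressive', 'results', 'noisy', 'data']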
Source file: keyphrase_extraction.py — project: text-analytics-with-python, author: dipanjanS
def get_chunks(sentences, grammar=r'NP: {<DT>? <JJ>* <NN.*>+}'):
    # Relies on module-level imports of itertools and nltk, plus a module-level
    # stopword_list, none of which are shown in this excerpt.
    all_chunks = []
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    for sentence in sentences:
        tagged_sents = nltk.pos_tag_sents([nltk.word_tokenize(sentence)])
        chunks = [chunker.parse(tagged_sent) for tagged_sent in tagged_sents]
        wtc_sents = [nltk.chunk.tree2conlltags(chunk) for chunk in chunks]
        flattened_chunks = list(itertools.chain.from_iterable(wtc_sents))
        # group consecutive (word, tag, chunk) triples by whether they sit inside a chunk
        # (Python 3 removed tuple-unpacking lambdas, so index into the triple)
        valid_chunks_tagged = [(status, list(chunk))
                               for status, chunk in itertools.groupby(flattened_chunks,
                                                                      lambda wtc: wtc[2] != 'O')]
        # keep in-chunk groups, join their words, and drop stopwords
        valid_chunks = [' '.join(word.lower() for word, tag, chunk in wtc_group
                                 if word.lower() not in stopword_list)
                        for status, wtc_group in valid_chunks_tagged if status]
        all_chunks.append(valid_chunks)
    return all_chunks
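A usage sketch; the module-level setup below is an assumption filling in what the excerpt leaves out:

import itertools, nltk
stopword_list = nltk.corpus.stopwords.words('english')  # assumed module-level dependency

sentences = ["Elephants are large land mammals.", "They roam the African savanna."]
print(get_chunks(sentences))
# roughly [['elephants', 'large land mammals'], ['african savanna']]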
def tokenize(str_stream, eos=True, remove_punct=False):
    """
    Given a str or str_stream (f.read()), convert the str to a list of sentences,
    e.g. [[word, word], [word, word, ...], ...]
    :param str_stream: a str or a str stream
    :param eos: whether to turn the final '.' of each sentence into an <eos> tag
    :param remove_punct: whether to remove the punctuation tokens ':', ';', '--', ',', "'"
    :return: a tuple (tokens, pos_tags): a list of sentences, each a list of words (str),
             and the corresponding list of POS-tag tuples
    """
    # import nltk lazily because importing it is slow
    import nltk
    try:
        nltk.data.load('tokenizers/punkt/english.pickle')
    except LookupError:
        print('punkt resource not found, using nltk.download("punkt") to download resource data...')
        nltk.download('punkt')
    tokens = [nltk.word_tokenize(t) for t in nltk.sent_tokenize(str_stream.lower())]
    # get POS tags for each sentence, using the universal tagset
    tokens_tags = nltk.pos_tag_sents(tokens, tagset='universal')
    pos_tags = []
    for token_tags in tokens_tags:
        _, tags = zip(*token_tags)
        pos_tags.append(tags)
    # replace numeric tokens with the placeholder 'N'
    # (isfloat is a module-level helper not shown in this excerpt)
    tokens = [['N' if isfloat(t) else t for t in sublist] for sublist in tokens]
    if eos:
        for token in tokens:
            token[-1] = '<eos>'
    if remove_punct:
        # __punct_set is a module-level constant not shown in this excerpt
        tokens = [[t for t in sublist if t not in __punct_set] for sublist in tokens]
    return tokens, pos_tags
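A usage sketch; isfloat is undefined in the excerpt, so the helper below is an assumed stand-in that must live in the same module as tokenize:

def isfloat(s):  # assumed helper: True when the token parses as a number
    try:
        float(s)
        return True
    except ValueError:
        return False

tokens, pos_tags = tokenize("The price rose 3.5 percent. Analysts cheered.")
print(tokens[0])  # ['the', 'price', 'rose', 'N', 'percent', '<eos>']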
def generate_candidate(texts, method='word', remove_punctuation=False):
    """
    Generate keyword candidates from a given string.

    Parameters
    ----------
    texts: str, input text string
    method: str, method to extract candidates, either 'word' or 'phrase'
    remove_punctuation: bool, whether to strip punctuation from each sentence first

    Returns
    -------
    candidates: list, list of candidate words or phrases

    Relies on module-level names not shown in this excerpt: punct_re, stop_words,
    and imports of sent_tokenize, word_tokenize, pos_tag_sents, RegexpParser,
    tree2conlltags, chain, and groupby.
    """
    words_ = list()
    candidates = list()
    # tokenize texts into a list of sentences of lowercased words
    sentences = sent_tokenize(texts)
    for sentence in sentences:
        if remove_punctuation:
            sentence = punct_re.sub(' ', sentence)  # remove punctuation
        words = word_tokenize(sentence)
        words = list(map(lambda s: s.lower(), words))
        words_.append(words)
    tagged_words = pos_tag_sents(words_)  # POS tagging
    if method == 'word':
        # keep single adjectives and nouns that are not stop words
        tags = {'JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'}
        tagged_words = chain.from_iterable(tagged_words)
        for word, tag in tagged_words:
            if tag in tags and word.lower() not in stop_words:
                candidates.append(word)
    elif method == 'phrase':
        # chunk adjective/noun phrases, optionally joined by a preposition
        grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
        chunker = RegexpParser(grammar)
        all_tag = chain.from_iterable(tree2conlltags(chunker.parse(tag)) for tag in tagged_words)
        for key, group in groupby(all_tag, lambda tag: tag[2] != 'O'):
            candidate = ' '.join(word for (word, pos, chunk) in group)
            if key and candidate not in stop_words:
                candidates.append(candidate)
    else:
        print("Use either 'word' or 'phrase' for method")
    return candidates
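A usage sketch; the module-level setup below is an assumption standing in for what the excerpt omits:

import re, string
from itertools import chain, groupby
from nltk import sent_tokenize, word_tokenize, pos_tag_sents, RegexpParser
from nltk.chunk import tree2conlltags
from nltk.corpus import stopwords

punct_re = re.compile('[%s]' % re.escape(string.punctuation))  # assumed definition
stop_words = set(stopwords.words('english'))                   # assumed definition

print(generate_candidate("Topic models uncover latent themes in large corpora.", method='phrase'))
# something like ['topic models', 'latent themes in large corpora']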