from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree


def ne_tagging(text):
    """Return the distinct named entities found in the given text."""
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            current_chunk = []
        else:
            continue
    # Flush a trailing entity at the end of the text, which would otherwise be dropped
    if current_chunk:
        named_entity = " ".join(current_chunk)
        if named_entity not in continuous_chunk:
            continuous_chunk.append(named_entity)
    return continuous_chunk
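A minimal usage sketch for ne_tagging(), assuming the required NLTK data packages (punkt, averaged_perceptron_tagger, maxent_ne_chunker, words) are already downloaded; the sample sentence is illustrative only:

sample = "Barack Obama met Angela Merkel in Paris."
print(ne_tagging(sample))
# expected shape: a list of entity strings such as ['Barack Obama', 'Angela Merkel', 'Paris']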
Example source code for Python's pos_tag()
def keyword_extractor(data):
    try:
        # np_extractor = NPExtractor(words_wo_stopwords(strip_tags(data)))
        # result = np_extractor.extract()
        text = words_wo_stopwords(strip_tags(data))
        # TODO: this is duplicated work and should be improved
        words = word_tokenize(strip_tags(text))
        tagged = pos_tag(words)
        cleaned = filter_insignificant(tagged)
        text = " ".join(cleaned)
        wc = WordCloudMod().generate(text)
        result = list(wc.keys())[:10]
    except Exception as err:
        print(colored.red("At keywords extraction {}".format(err)))
        result = []
    return result
# TODO: this could definitely be better if we knew where the content is
feature_construction.py (project: Automatic-Question-Generation, author: bwanglzu)
def _identify_pronoun(self, answer):
    """Calculate percentage of pronouns within answer
    - Args:
        answer(str): answer text
    - Returns:
        percentage(float): ratio of pronouns in answer
    """
    text = nltk.word_tokenize(answer)
    post = nltk.pos_tag(text)
    pronoun_list = ['PRP', 'PRP$', 'WP', 'WP$']
    # init variables
    num_pronouns = 0
    num_terms = len(post)
    percentage = 0
    for k, v in post:
        if v in pronoun_list:
            num_pronouns += 1
    percentage = float(num_pronouns) / num_terms
    return percentage
feature_construction.py (project: Automatic-Question-Generation, author: bwanglzu)
def _identify_pronoun2(self, sentence):
    """Calculate percentage of pronouns in the sentence that are in the answer
    - Args:
        sentence(str): question sentence
    - Returns:
        pronoun_in_sentence(list): pronouns in sentence
        sentence_len(int): length of current sentence
    """
    text = nltk.word_tokenize(sentence)
    post = nltk.pos_tag(text)
    pronoun_list = ['PRP', 'PRP$', 'WP', 'WP$']
    pronoun_in_sentence = []
    sentence_len = len(post)
    for k, v in post:
        if v in pronoun_list:
            pronoun_in_sentence.append(k)
    return pronoun_in_sentence, sentence_len
feature_construction.py (project: Automatic-Question-Generation, author: bwanglzu)
def _first_tagger_after_answer_span(self, question):
    """Get the first tagger after answer span
    - Args:
        question(string): string of current question
    - Returns:
        tagger(string): tagger of first term after span
    """
    index = 0
    text = nltk.word_tokenize(question)
    post = nltk.pos_tag(text)
    for idx, t in enumerate(post):
        if t[0] == '_____':
            index = idx + 1
            break
    try:
        return post[index][1]
    except IndexError:
        return 'dummy'
feature_construction.py (project: Automatic-Question-Generation, author: bwanglzu)
def _first_tagger_before_answer_span(self, question):
    """Get the first tagger before answer span
    - Args:
        question(string): string of current question
    - Returns:
        tagger(string): tagger of first term before span
    """
    index = 0
    text = nltk.word_tokenize(question)
    post = nltk.pos_tag(text)
    for idx, t in enumerate(post):
        if t[0] == "_____":
            index = idx - 1
            break
    try:
        return post[index][1]
    except IndexError:
        return 'dummy'
def tag(self, lines):
    '''
    Tokenize and categorise the words in the collection of
    text
    :param lines: The list of strings with the text to match
    :type lines: ``list`` of ``str``
    :rtype: ``list`` of ``tuple``
    :return: The (token, POS tag) pairs for the text
    '''
    try:
        tokenized_words = nltk.word_tokenize(lines)
        return nltk.pos_tag(tokenized_words)
    except LookupError as le:
        print("Run install_words.py first")
        raise le
def brown_data():
    """Return the first text_length tokens of the Brown corpus, POS-tagged, in pyrata format."""
    tokens = brown.words()
    tokens = tokens[:text_length]
    pos_tags = nltk.pos_tag(tokens)
    return [{'raw': w, 'pos': p} for (w, p) in pos_tags]
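A small driver sketch for brown_data(); the imports and the text_length setting live elsewhere in the original module, so the values below are assumptions:

import nltk
from nltk.corpus import brown

text_length = 100  # assumed value for illustration

data = brown_data()
print(len(data), data[0])  # e.g. 100 and a dict like {'raw': 'The', 'pos': 'DT'}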
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# TEST
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
def tag_one(self, text, skip_unknown=True, **kwargs):
    """ POS-tags the given text, optionally skipping unknown lemmas

        :param unicode text: Text to be tagged
        :param bool skip_unknown: Automatically remove unrecognized tags from the result

        Sample usage:

        >>> from strephit.commons.pos_tag import TTPosTagger
        >>> from pprint import pprint
        >>> pprint(TTPosTagger('en').tag_one(u'sample sentence to be tagged fycgvkuhbj'))
        [Tag(word=u'sample', pos=u'NN', lemma=u'sample'),
         Tag(word=u'sentence', pos=u'NN', lemma=u'sentence'),
         Tag(word=u'to', pos=u'TO', lemma=u'to'),
         Tag(word=u'be', pos=u'VB', lemma=u'be'),
         Tag(word=u'tagged', pos=u'VVN', lemma=u'tag')]
    """
    return self._postprocess_tags(make_tags(self.tagger.tag_text(text, **kwargs)),
                                  skip_unknown)
def _get_base_doge_words(self, eng_text):
    """
    Get all base words from text to make doge phrases from.
    eg. 'Hello there, I am happy' -> ['hello', 'are', 'happy']
    Args:
        eng_text (str): Text to get words from.
    Returns:
        list[str]: List of lower case words to use from text.
    """
    phrase_no_punct = "".join([ch for ch in eng_text if ch not in string.punctuation])
    tagged_words = nltk.pos_tag([w.lower() for w in phrase_no_punct.split(' ') if w.isalpha()])
    chosen_words = []
    for word, tag in tagged_words:
        if tag[0] in ['N', 'V', 'J']:
            # make noun singular
            if tag[0] == 'N':
                word = self._lemmatizer.lemmatize(word, pos='n')
            # make verb infinitive
            elif tag[0] == 'V':
                word = self._lemmatizer.lemmatize(word, pos='v')
            chosen_words.append(word.encode('ascii', 'ignore'))  # lemmatize makes word unicode
    return list(set(chosen_words))
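Because the method above depends on self._lemmatizer, here is a standalone sketch of the same tag-prefix lemmatization idea, assuming NLTK's WordNetLemmatizer (requires the wordnet data):

import nltk
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
words = [w.lower() for w in "Hello there, I am happy".split(' ') if w.isalpha()]
for word, tag in nltk.pos_tag(words):
    if tag[0] == 'N':
        print(lemmatizer.lemmatize(word, pos='n'))   # singular noun
    elif tag[0] == 'V':
        print(lemmatizer.lemmatize(word, pos='v'))   # infinitive verb
    elif tag[0] == 'J':
        print(word)                                  # adjectives kept as-is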
def _get_doge_descriptors(self, word_ls):
    """
    Get descriptors for a set of doge words.
    eg. ['person', 'run'] -> ['much', 'very']
    Args:
        word_ls (list[str]): List of words to use.
    Returns:
        list[str]: List of doge descriptors, eg. 'much', 'very', in order.
    """
    tagged_words = nltk.pos_tag(word_ls)
    chosen_descriptors = []
    for word, tag in tagged_words:
        possible_descs = [MUCH, MANY, SUCH, SO, VERY]
        if tag[0] == 'J':
            possible_descs.remove(VERY)
            possible_descs.remove(SO)
        if len(chosen_descriptors) >= 2:
            allowed_descriptors = [s for s in possible_descs if s not in chosen_descriptors[-2:]]
        else:
            allowed_descriptors = [s for s in possible_descs if s not in chosen_descriptors]
        chosen_descriptors.append(random.choice(allowed_descriptors))
    return chosen_descriptors
def extract_candidate_words(sents, tags=GOODTAGS, tagged=False, **kwargs):
    """
    Extracts key words based on a list of good part of speech tags.
    If the sentences are already tokenized and tagged, pass in: tagged=True
    """
    normalizer = Normalizer(**kwargs)
    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.wordpunct_tokenize(sent))
        # Identify only good words by their tag
        for token, tag in sent:
            if tag in tags:
                for token in normalizer.normalize([token]):
                    yield token
##########################################################################
## Key phrase by text scoring mechanisms
##########################################################################
def normalize(self, words):
    """
    Normalizes a list of words.
    """
    # Add part of speech tags to the words
    words = nltk.pos_tag(words)
    for word, tag in words:
        if self.lower:
            word = word.lower()
        if self.strip:
            word = word.strip()
        if word not in self.stopwords:
            if not all(c in self.punct for c in word):
                if self.lemmatize:
                    # Assumes self.lemmatizer accepts the Penn Treebank tag directly;
                    # WordNetLemmatizer would need the tag converted to a WordNet POS first.
                    word = self.lemmatizer.lemmatize(word, tag)
                yield word
def tagged_abstracts(RS_pmids_tokenizedabstracts_dict):
    """Takes a dict of tokenized abstracts
    and tags them using the NLTK module for Natural Language Entities.
    Input dictionary: key is the RS ID, value is a dictionary where key is the pmid and value is a list of tokens."""
    RS_pmids_taggedabstracts_dict = {}
    for each_RS in RS_pmids_tokenizedabstracts_dict:
        pmids_taggedabstracts = {}
        pmids_tokenizedabstracts = RS_pmids_tokenizedabstracts_dict[each_RS]
        for pmid in pmids_tokenizedabstracts:
            taggedabstracts_list = []
            for token in pmids_tokenizedabstracts[pmid]:
                # Note: nltk.pos_tag expects a sequence of tokens; passing a bare
                # string here would tag its individual characters.
                tagged = nltk.pos_tag(token)
                taggedabstracts_list.append(tagged)
            pmids_taggedabstracts[pmid] = taggedabstracts_list
        RS_pmids_taggedabstracts_dict[each_RS] = pmids_taggedabstracts
    return RS_pmids_taggedabstracts_dict
import nltk
from collections import defaultdict
from nltk.parse import DependencyGraph


def from_sentence(sent):
    """Build an unlabelled DependencyGraph from a raw sentence string."""
    tokens = nltk.word_tokenize(sent)
    tagged = nltk.pos_tag(tokens)
    dg = DependencyGraph()
    for (index, (word, tag)) in enumerate(tagged):
        dg.nodes[index + 1] = {
            'word': word,
            'lemma': '_',
            'ctag': tag,
            'tag': tag,
            'feats': '_',
            'rel': '_',
            'deps': defaultdict(),
            'head': '_',
            'address': index + 1,
        }
    dg.connect_graph()
    return dg
def prepare_sentence(words,
                     vectorizer=None,
                     lemmatizer=None,
                     max_words=78,
                     return_output=True):
    X = np.ones((max_words, 300)) * ZERO_EPSILON
    if return_output:
        y = np.ones((max_words, 300)) * ZERO_EPSILON
    raw_pos = [p[1] for p in pos_tag(words)]
    pos = [str(treebank_to_simple(p, default=wordnet.NOUN)) for p in raw_pos]
    lemmas = [str(lemmatizer(w, pos=p)) for (w, p) in zip(words, pos)]
    num_words = len(words) if len(words) <= max_words else max_words
    for word_i in range(num_words):
        word_vector = vectorizer(words[word_i])
        X[word_i, :] = word_vector
        if return_output:
            lemma_vector = lemmas[word_i]
            y[word_i, :] = vectorizer(lemma_vector)
    if return_output:
        return X, y
    return X
def extract_candidate_phrases(sents, grammar=GRAMMAR, tagged=False):
    # Create the chunker that uses our grammar
    chunker = RegexpParser(grammar)
    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.word_tokenize(sent))
        # Parse the sentence, converting the parse tree into a tagged sequence
        sent = normalize(sent)
        if not sent:
            continue
        chunks = tree2conlltags(chunker.parse(sent))
        # Extract phrases and rejoin them with space
        phrases = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(
                chunks, lambda term: term[-1] != 'O'
            ) if key
        ]
        for phrase in phrases:
            yield phrase
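GRAMMAR and normalize() are defined elsewhere in the original module; a sketch with an assumed noun-phrase chunk grammar and a pass-through normalize (both of which would sit before the function definition) could look like this:

import nltk
from itertools import groupby
from nltk import RegexpParser
from nltk.chunk import tree2conlltags

GRAMMAR = r"KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}"  # assumed keyterm grammar

def normalize(sent):
    return sent  # stand-in; the project's real normalize() likely filters stopwords/punctuation

for phrase in extract_candidate_phrases(["The quick brown fox jumps over the lazy dog."]):
    print(phrase)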
def get_counts():
    global unigrams
    global bigrams
    global sentences
    for i in xrange(1, NUM_FILES + 1):
        if i in SKIP:
            continue
        with open("Shakespeare_parsed/%03d" % i) as f:
            for line in f:
                tokens = get_tokens(line)
                tokens = [t.lower() for t in tokens]
                tags = nltk.pos_tag(tokens)
                if len(tokens) == 0:
                    continue
                sentences.append(tokens)
                prev_word = ""
                for token in tokens:
                    unigrams[token] += 1
                    if not prev_word == "":
                        bigrams[(prev_word, token)] += 1
                    prev_word = token
    top10_uni = unigrams.most_common()[:10]
    top10_bi = bigrams.most_common()[:10]
def tag_contexts(doc_id):
    global tags
    if not tags:
        tags = nltk.data.load("help/tagsets/upenn_tagset.pickle")
    words = defaultdict(Counter)
    count = Counter()
    for context in get_contexts(doc_id):
        for word, tag in nltk.pos_tag(tokenize(context)):
            words[tag].update([word])
            count.update([tag])
    tag_common_words = {tag: ' '.join(zip(*tag_words.most_common(10))[0]) for tag, tag_words in words.items()}
    for tag, freq in count.most_common(15):
        print "%4d\t%45s\t%s" % (freq, tags[tag][0], tag_common_words[tag])
reddit_NN_entities.py (project: Hanhan_Play_With_Social_Media, author: hanhanwu)
def get_NN_entities(post):
    sentences = nltk.tokenize.sent_tokenize(post)
    token_sets = [nltk.tokenize.word_tokenize(s) for s in sentences]
    pos_tagged_token_sets = [nltk.pos_tag(t) for t in token_sets]
    pos_tagged_tokens = [t for v in pos_tagged_token_sets for t in v]
    all_entities = []
    previous_pos = None
    current_entities = []
    for (entity, pos) in pos_tagged_tokens:
        if previous_pos == pos and pos.startswith('NN'):
            current_entities.append(entity.lower())
        elif pos.startswith('NN'):
            if current_entities != []:
                all_entities.append(' '.join(current_entities))
            current_entities = [entity.lower()]
        previous_pos = pos
    # Flush the final noun group, which would otherwise be dropped
    if current_entities:
        all_entities.append(' '.join(current_entities))
    return all_entities
def tokenize(self, document):
    # Break the document into sentences
    for sent in sent_tokenize(document):
        # Break the sentence into part of speech tagged tokens
        for token, tag in pos_tag(wordpunct_tokenize(sent)):
            # Apply preprocessing to the token
            token = token.lower() if self.lower else token
            token = token.strip() if self.strip else token
            token = token.strip('_') if self.strip else token
            token = token.strip('*') if self.strip else token
            # If stopword, ignore token and continue
            # if token in self.stopwords:
            #     continue
            # If punctuation, ignore token and continue
            if all(char in self.punct for char in token):
                continue
            # Lemmatize the token and yield
            lemma = self.lemmatize(token, tag)
            yield lemma
def clean_text(raw_text, filtered_word_types):
    """Clean raw text for a bag-of-words model."""
    # Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", raw_text)
    # Convert to lower case, split into individual words
    words = letters_only.lower().split()
    # Stem words
    stemmer = PorterStemmer()
    stemmed_words = list(map(stemmer.stem, words))
    # Remove filtered word types if requested
    if filtered_word_types is not None:
        tagged_text = nltk.pos_tag(stemmed_words)
        stemmed_words = [w for w, wtype in tagged_text if wtype not in filtered_word_types]
    # Join back together
    return " ".join(stemmed_words)
def get_lemmas(sent, lemmatizer):
    res = []
    for word in sent:
        pos = get_wordnet_pos(nltk.pos_tag([word])[0][1])
        if pos == '':
            lemma = lemmatizer.lemmatize(word)
        else:
            lemma = lemmatizer.lemmatize(word, pos)
        # if type(lemma) == unicode:
        #     lemma = lemma.encode('ascii', 'ignore')
        if lemma.isdigit():
            res.append('number')
        else:
            res.append(lemma)
    return res
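get_wordnet_pos() is not shown in this snippet; a typical implementation (an assumption here, the project's own helper may differ) maps the first letter of the Penn Treebank tag to the corresponding WordNet constant, returning '' when there is no mapping so the caller falls back to the default lemmatization:

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    return ''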
def pos_tag_questions(qstn_list):
    res = []
    count = 0
    for i in qstn_list:
        r = []
        i = i.split(':')
        r.append(i[0])
        r.append(i[1].split()[0])
        i = i[1].split()
        del i[0]
        sent = nltk.word_tokenize(' '.join(i))
        r.append(nltk.pos_tag(sent))
        res.append(tuple(r))
        count += 1
        if (count % 100) == 0:
            print("processed : " + str(count))
    return res
# Experiment with different features to get better accuracy.
# Also don't forget to include the same feature extractor in process_grammar.py.
def __init__(self):
    super(RssSkill, self).__init__('RssSkill')
    self._is_reading_headlines = False
    self.feeds = {}
    self.cached_items = {}
    self.cache_time = {}
    try:
        pos_tag('advance')
    except LookupError:
        logger.debug('Tagger not installed... Trying to download')
        dler = Downloader()
        if not dler.download('averaged_perceptron_tagger'):
            logger.debug('Trying alternative source...')
            dler = Downloader(ALT_NLTK_DATA)
            dler.download('averaged_perceptron_tagger',
                          raise_on_error=True)
generate_stem_pos_tag.py (project: kaggle-quora-solution-8th, author: qqgeogor)
def pos_tag_text(line,
                 token_pattern=token_pattern,
                 exclude_stopword=stopwords,
                 encode_digit=False):
    token_pattern = re.compile(token_pattern, flags=re.UNICODE | re.LOCALE)
    for name in ["question1", "question2"]:
        l = line[name]
        ## tokenize
        tokens = [x.lower() for x in token_pattern.findall(l)]
        ## stem
        # tokens = l.lower().split()
        # print tokens
        tokens = stem_tokens(tokens, english_stemmer)
        line[name + '_stem'] = ' '.join(tokens)
        # print tokens
        if exclude_stopword:
            tokens = [x for x in tokens if x not in stopwords]
        tags = pos_tag(tokens)
        tags_list = [t for w, t in tags]
        tags_str = " ".join(tags_list)
        # print tags_str
        line[name + '_pos_tag'] = tags_str
    return line[[u'question1_stem', u'question1_pos_tag', u'question2_stem',
                 u'question2_pos_tag']]
generate_neighbor_pos.py (project: kaggle-quora-solution-8th, author: qqgeogor)
def get_pos_tag(qind):
    q = index_q[qind]
    wl = str(q).lower().split()
    pos_l = nltk.pos_tag(wl)
    q1_pos = []
    for pos in pos_l:
        q1_pos.append(pos[1])
    return q1_pos

# def get_ner_tag(qind):
#     q = index_q[qind]
#     wl = str(q).lower().split()
#     ner_l = nltk.ne_chunk(wl)
#     q1_ner = []
#     for pos in ner_l:
#         q1_ner.append(pos[1])
#     return q1_ner
generate_ngram_pos_link.py (project: kaggle-quora-solution-8th, author: qqgeogor)
def getPOSLinks(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    text = nltk.word_tokenize(text)
    pos = nltk.pos_tag(text)
    links = []
    link = []
    active = False
    for w in pos:
        part = w[1]
        word = w[0]
        if not active and (part[:2] == "DT" or part == "WP" or part == "VB" or part == "IN"):
            active = True
        if active:
            link.append(wordnet_lemmatizer.lemmatize(word))
        # extract main body
        if active and (part == "PRP" or part[:2] == "NN" or part == "."):
            active = False
            links.append(" ".join(link))
            link = []
    return links
def tag(path, filename):
    print("Tagging " + path)
    WRITE_HANDLER = open(PREPROCESSED_DATA + filename.strip() + "_features", 'w')
    for line in open(path, 'r'):
        tokens = line.split()
        if len(tokens) == 0:
            continue
        tags = pos_tag(tokens)  # tag
        features = list()
        for token in tags:
            tok = token[0]
            tag = token[1]
            if tok.lower() not in stop_words:
                features.append(tok + ":" + tag)
        if len(features) > 0:
            WRITE_HANDLER.write(str(features) + '\n\n')
        else:  # EMPTY lines
            WRITE_HANDLER.write('\n\n')
def _analyze_query(self):
    tagged = nltk.pos_tag(self.ir_query)
    ir_query_tagged = []
    for word, pos in tagged:
        # Exactly one of these startswith() checks can be True, so looking up the
        # True key maps the Penn Treebank tag to a WordNet POS (or None).
        pos = {
            pos.startswith('N'): wordnet.NOUN,
            pos.startswith('V'): wordnet.VERB,
            pos.startswith('J'): wordnet.ADJ,
            pos.startswith('R'): wordnet.ADV,
        }.get(True, None)
        if pos:
            synsets = wordnet.synsets(word, pos=pos)
        else:
            synsets = wordnet.synsets(word)
        ir_query_tagged.append((word, synsets))
    # Add an additional special hidden term
    ir_query_tagged.append(('cause', [wordnet.synset('cause.v.01')]))
    self.ir_query_tagged = ir_query_tagged
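A standalone sketch of the same tag-to-WordNet mapping outside the class, run on an assumed query string:

import nltk
from nltk.corpus import wordnet

for word, pos in nltk.pos_tag(nltk.word_tokenize("heavy rain causes flooding")):
    wn_pos = {
        pos.startswith('N'): wordnet.NOUN,
        pos.startswith('V'): wordnet.VERB,
        pos.startswith('J'): wordnet.ADJ,
        pos.startswith('R'): wordnet.ADV,
    }.get(True)
    synsets = wordnet.synsets(word, pos=wn_pos) if wn_pos else wordnet.synsets(word)
    print(word, wn_pos, len(synsets))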