def combine_pos_tag(self, pos_tag):
    noun = ['NN', 'NNS', 'NNP', 'NNPS']
    adjective = ['JJ', 'JJR', 'JJS']
    adverb = ['RB', 'RBR', 'RBS']
    verb = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    wh = ['WDT', 'WP', 'WRB']
    if pos_tag in noun:
        return 'NN'
    elif pos_tag in adjective:
        return 'JJ'
    elif pos_tag in adverb:
        return 'RB'
    elif pos_tag in verb:
        return 'VB'
    elif pos_tag in wh:
        return 'WP'
    else:
        return pos_tag
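# A minimal usage sketch of the coarse-grained mapping above. Since `self` is
# unused, None stands in for the instance (assumption: in the original project
# this method belongs to some feature-extraction class).
import nltk

for word, tag in nltk.pos_tag(nltk.word_tokenize("Dogs quickly chased the smaller cats")):
    print(word, combine_pos_tag(None, tag))  # e.g. NNS -> NN, VBD -> VB, JJR -> JJ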
def branch(words):
    """
    The initial filter for the input sentence.
    It tokenizes the words and tags them with parts of speech,
    then passes the tagged tokens to one of three handlers:
    a sentence is either declarative(), interrogative(), or imperative().
    Args:
        words (String): The words input by the user
    Returns:
        String: response from the handler for that sentence type.
    """
    parts_of_speech = nltk.pos_tag(nltk.word_tokenize(words))
    leading_tag = parts_of_speech[0][1][0]
    if leading_tag == 'W':
        return interrogative(parts_of_speech[1:])
    elif leading_tag == "V":
        return imperative(parts_of_speech)
    else:
        return declarative(parts_of_speech)
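# Quick illustration of the routing heuristic above: branch() looks only at
# the first character of the first token's POS tag ('W' -> interrogative,
# 'V' -> imperative, anything else -> declarative). The three handlers are not
# part of this excerpt, so this sketch just prints the leading tag.
import nltk

for s in ["Where is the library", "Open the window", "The sky is blue"]:
    leading = nltk.pos_tag(nltk.word_tokenize(s))[0][1]
    print(s, "->", leading)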
def tokenize(data, language="english", filterStopWords=False, tagging=False):
    result = {}
    tags = []
    filterChars = [",", ".", "?", ";", ":", "'", "!", "@", "#", "$", "%", "&", "*",
                   "(", ")", "+", "{", "}", "[", "]", "\\", "|"]
    sent_token = nltk.tokenize.sent_tokenize(data, language)
    word_token = nltk.tokenize.word_tokenize(data, language)
    word_token = [w for w in word_token if w not in filterChars]
    if filterStopWords:
        stop_words = set(stopwords.words(language))
        word_token = [w for w in word_token if w not in stop_words]
    if tagging:
        tags = nltk.pos_tag(word_token)
    result = {"sent_token": sent_token, "word_token": word_token, "pos_tag": tags}
    return json.loads(jsonpickle.encode(result, unpicklable=False))
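# Example call (assumes the imports this snippet relies on -- nltk, json,
# jsonpickle and nltk.corpus.stopwords -- plus the 'punkt' and 'stopwords'
# NLTK data packages):
result = tokenize("NLTK makes tagging easy. It ships with several taggers.",
                  filterStopWords=True, tagging=True)
print(result["word_token"])
print(result["pos_tag"])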
def change_sentence(self):
    text = nltk.tokenize.word_tokenize(self._sentence)
    changed = False
    for cur in nltk.pos_tag(text):
        if cur[1] == "NN" or cur[1] == "NNP" or cur[1] == "PRP":
            foundedTmura = self.getFromDB(cur[0])
            if foundedTmura is None:
                foundedTmura = getTmura(cur[0])
                if foundedTmura != "not found":
                    self.add2DB(cur[0], foundedTmura)
            if foundedTmura != "not found" and not changed:
                if "OR" in foundedTmura:
                    foundedTmura = foundedTmura.replace('OR', 'or')
                if randrange(2) == 0:
                    rep = cur[0] + ", " + foundedTmura + ", "
                else:
                    rep = cur[0] + "(" + foundedTmura + ") "
                self._sentence = self._sentence.replace(cur[0], rep)
                changed = True
    return self._sentence
def analysis(reviews_collection_text):
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        raw_data = f.read()
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        comments = f.readlines()
    data = raw_data.replace('\n', ' ')
    data_lower = data.lower()
    tokens_with_punc = word_tokenize(data_lower)
    tokens = RegexpTokenizer(r'\w+').tokenize(data_lower)
    print("--- Most frequent tokens ---\n",
          FreqDist(tokens_with_punc).most_common(15))
    print("--- Tokens without punctuation ---\n",
          FreqDist(tokens).most_common(15))
    stop = set(stopwords.words('english'))
    words = [word for word in tokens if word not in stop]
    print("--- Most frequent words ---\n", FreqDist(words).most_common(15))
    tagged = pos_tag(words)
    nouns = [word for word, pos in tagged if pos == 'NN']
    print("--- Most frequent nouns ---\n", FreqDist(nouns).most_common(15))
    adjts = [word for word, pos in tagged if pos == 'JJ']
    print("--- Most frequent adjectives ---\n", FreqDist(adjts).most_common(15))
    tokns = [RegexpTokenizer(r'\w+').tokenize(comment) for comment in comments]
    lxdst = [lexical_density(token) for token in tokns if len(token) > 0]
    avgld = sum(lxdst) / len(comments)
    print("--- Average lexical density ---\n", avgld)
def whereRules(sentenceOriginal):
    score = 0
    sentence = sentenceOriginal.lower()
    # RULE 1 (named-entity check, disabled)
    # for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentenceOriginal))):
    #     if type(chunk) is nltk.tree.Tree:
    #         if 'LOCATION' in chunk.label() or 'GPE' in chunk.label():
    #             score += 10
    # RULE 2
    for word in LOCPREP:
        if word in sentence:
            score += 4
    # RULE 3
    for word in LOCATION:
        if word in sentence:
            score += 6
    return score
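# LOCPREP and LOCATION are keyword lists defined elsewhere in the project;
# purely illustrative values (assumptions) might look like:
LOCPREP = ['in', 'at', 'near', 'on', 'inside']
LOCATION = ['city', 'town', 'country', 'river', 'street', 'north', 'south']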
# WHEN RULES
def check_imperative(self, paragraph):
    """
    Check the given sentence(s) for imperatives.
    :param paragraph:
        The input paragraph to be tested.
    :return:
        A tuple of 2 elements (invalid word, part of speech),
        or None if no invalid word is found.
    """
    words = nltk.word_tokenize(nltk.sent_tokenize(paragraph)[0])
    # VBZ: Verb, 3rd person singular present, like 'adds', 'writes' etc.
    # VBD: Verb, past tense, like 'added', 'wrote' etc.
    # VBG: Verb, present participle, like 'adding', 'writing'
    word, tag = nltk.pos_tag(['I'] + words)[1:2][0]
    if (tag.startswith('VBZ') or
            tag.startswith('VBD') or
            tag.startswith('VBG') or
            word.endswith('ing')):  # Handle special case for VBG
        return (word, tag)
    else:
        return None
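# Sanity check of the heuristic above; prepending 'I' nudges the tagger toward
# reading the first word as a verb. `self` is unused, so None stands in here.
print(check_imperative(None, "Adds a parser for YAML files."))  # likely ('Adds', 'VBZ')
print(check_imperative(None, "Add a parser for YAML files."))   # likely None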
def word_split(self, sentence):
    words = re.split(self.word_split_pattern, sentence)
    words = [w for w in words if len(w) > 0]
    words = ["::".join(tag) for tag in nltk.pos_tag(words)]
    return words
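# The states produced above look like "word::TAG"; a standalone illustration of
# the same formatting (without the class's word_split_pattern):
import nltk

print(["::".join(t) for t in nltk.pos_tag("The cat sat on the mat".split())])
# e.g. ['The::DT', 'cat::NN', 'sat::VBD', 'on::IN', 'the::DT', 'mat::NN']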
def _find_nouns(self, sentence):
    tokens = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)
    nouns = [word for word, pos in tagged
             if pos in ('NN', 'NNP', 'NNS', 'NNPS')]
    filter_keywords = ['chuck', 'norris', 'quot']
    filtered = [i for i in nouns if not any(f in i.lower() for f in filter_keywords)]
    return filtered
feature_construction.py, from the Automatic-Question-Generation project (author: bwanglzu)
def _count_token_with_match(self, answer, match):
    """Count the tokens in the answer whose POS tag equals the given match flag."""
    text = nltk.word_tokenize(answer)
    post = nltk.pos_tag(text)
    count = 0
    for k, v in post:
        if v == match:
            count += 1
    return count
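# Usage sketch (self is unused, so None stands in for the instance):
print(_count_token_with_match(None, "The cat sat on the mat", "NN"))  # number of NN-tagged tokens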
def is_noun(word):
    POS = nltk.pos_tag([word])[0][1]
    return POS.startswith('NN')
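# Single-word tagging is less reliable than tagging in context, but as a quick check:
print(is_noun("dog"))      # typically True
print(is_noun("quickly"))  # typically False (tagged RB)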
def get_tweet_tags(tweet):
    """ Break up a tweet into individual word parts """
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(tweet)
    # replace handles with real names
    for n, tok in enumerate(tokens):
        if tok.startswith('@'):
            handle = tok.strip("@")
            if handle in user.students:
                # If we have a database entry for the mentioned user, we can
                # easily substitute a full name.
                usr = user.NPUser(handle)
                tokens[n] = usr.fullname
            else:
                # If there is no database entry, we use the user's alias. While
                # this is the full name in many cases, it is often not reliable.
                usr = api.get_user(handle)
                tokens[n] = usr.name
    tagged = nltk.pos_tag(tokens)
    # In nltk, if a teacher's name is written with a period after an
    # abbreviated prefix, it is awkwardly broken up into 3 tags
    for n, tag in enumerate(tagged):
        # If there is the weird period after the prefix,
        if tag[1] == '.':
            # and it is in fact splitting up a person's name,
            if tagged[n - 1][1] == 'NNP' and tagged[n + 1][1] == 'NNP':
                if tagged[n - 1][0] in ['Mr', 'Ms', 'Mrs', 'Mx']:
                    # combine it into the actual name,
                    tagged[n - 1] = ('{}. {}'.format(tagged[n - 1][0],
                                                     tagged[n + 1][0]), 'NNP')
                    # and then remove the extra tags.
                    del tagged[n + 1]
                    del tagged[n]
    return tagged
def normalize_tokens(self):
    if len(self.stindices) != len(self.enindices):
        sys.stderr.write("\t\tIssue: overlapping tokenization for multiple tokens\n")
        return
    start = {}
    idx = 0
    for s in sorted(self.stindices):
        self.stindices[s] = idx
        start[idx] = s
        idx += 1
    end = {}
    idx = 0
    for t in sorted(self.enindices):
        self.enindices[t] = idx
        end[idx] = t
        if idx > 0 and end[idx - 1] > start[idx]:
            sys.stderr.write("\t\tIssue: overlapping tokenization of neighboring tokens\n")
            return
        token = self.text[start[idx] : t + 1].strip()
        if " " in token:
            sys.stderr.write("\t\tIssue: incorrect tokenization " + token + "\n")
            return
        if token == "":
            continue
        self.tokens.append(token)
        idx += 1
    try:
        self.nltkpostags = [ele[1] for ele in pos_tag(self.tokens)]
        for idx in range(len(self.tokens)):
            tok = self.tokens[idx]
            if self.nltkpostags[idx].startswith("V"):
                self.nltklemmas.append(lemmatizer.lemmatize(tok, pos='v'))
            else:
                self.nltklemmas.append(lemmatizer.lemmatize(tok))
    except IndexError:
        print(self.tokens)
        print(pos_tag(self.tokens))
    return True
def tag(self, tokens):
    """
    Add POS tags to token objects.
    :param tokens: list of token objects
    :type tokens: list(Token)
    :return: label-augmented list of Token objects
    :rtype: list(Token)
    """
    tags = pos_tag([token.get_text() for token in tokens])
    for token, tag in zip(tokens, tags):
        token.add_a_label('pos', tag[1])
    return tokens
def pos(text):
    tokens = nltk.word_tokenize(text)
    wordpos = nltk.pos_tag(tokens)
    return wordpos
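# Example (exact tags depend on the tagger model shipped with NLTK):
print(pos("Time flies like an arrow"))
# e.g. [('Time', 'NN'), ('flies', 'VBZ'), ('like', 'IN'), ('an', 'DT'), ('arrow', 'NN')]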
def __tagPartsOfSpeech(words):
    return [pair[1] for pair in nltk.pos_tag(words)]
def tag(text, tt_home):
    # Default NLTK's tokenizer
    # TreebankWordTokenizer + PunktSentenceTokenizer
    nltk_start = time()
    tokens = word_tokenize(text)
    # Default NLTK's POS tagger
    # ?
    # Use tagset='universal' for universal tagset
    nltk_tagged = pos_tag(tokens)
    nltk_end = time()
    nltk_execution = nltk_end - nltk_start
    logger.info("NLTK took %f seconds" % nltk_execution)
    # TreeTagger wrapper
    # Tokenization: ?
    # Default language: English
    # English: trained on Penn treebank
    # Default flags: -token -lemma -sgml -quiet -no-unknown
    tt_start = time()
    tt = TreeTagger(TAGDIR=tt_home)
    raw_tags = tt.tag_text(text)
    tt_end = time()
    tt_execution = tt_end - tt_start
    tt_tagged = make_tags(raw_tags)
    logger.info("TreeTagger took %f seconds" % tt_execution)
    return (nltk_tagged, nltk_execution), (tt_tagged, tt_execution)
def tag_one(self, text, tagset, **kwargs):
    """ POS-tags the given text """
    return pos_tag(word_tokenize(text), tagset)
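# Usage sketch (self is unused, so None stands in); passing 'universal' maps
# Penn Treebank tags to the universal tagset, assuming the 'universal_tagset'
# NLTK data package is installed:
print(tag_one(None, "The cat sat on the mat", 'universal'))
# e.g. [('The', 'DET'), ('cat', 'NOUN'), ('sat', 'VERB'), ('on', 'ADP'), ('the', 'DET'), ('mat', 'NOUN')]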