def lookup(self, node, depgraph, counter):
    semtype_names = self.get_semtypes(node)

    semtype = None
    for name in semtype_names:
        if name in self:
            semtype = self[name]
            break
    if semtype is None:
        # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
        return []

    self.add_missing_dependencies(node, depgraph)

    lookup = self._lookup_semtype_option(semtype, node, depgraph)

    if not len(lookup):
        raise KeyError(
            "There is no GlueDict entry for sem type of '%s' "
            "with tag '%s', and rel '%s'" %
            (node['word'], node['tag'], node['rel'])
        )

    return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter)
Python tag() usage examples (collected source snippets)
Source: glue.py — project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda (author: SignalMedia)
Source: feature_construction.py — project: Automatic-Question-Generation (author: bwanglzu)
def _pos1_gram_after_answer(self, row, flag):
    """The first POS tag following the answer span is [FLAG].

    - Args:
        row (pandas.DataFrame row): input row
        flag (str): POS tag symbol to match against the first tag after the span
    - Returns:
        binary (int): 1 if it matches, 0 otherwise
    """
    question = row.Question
    if question:
        first_tagger = self._first_tagger_after_answer_span(question)
        if first_tagger == flag:
            return 1
        else:
            return 0
    else:
        return 0
def _pos1_gram_before_answer(self, row, flag):
    """The first POS tag before the answer span is [FLAG].

    - Args:
        row (pandas.DataFrame row): input row
        flag (str): POS tag symbol to match against the first tag before the span
    - Returns:
        binary (int): 1 if it matches, 0 otherwise
    """
    question = row.Question
    if question:
        first_tagger = self._first_tagger_before_answer_span(question)
        if first_tagger == flag:
            return 1
        else:
            return 0
    else:
        return 0
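Both helpers simply compare a precomputed "first POS tag around the answer span" against flag. As a rough, self-contained illustration of that underlying check (not the project's actual _first_tagger_*_answer_span helpers, which are not shown here), the first tag after a marked span can be computed with nltk.pos_tag. The <answer> markers below are purely hypothetical, and the sketch assumes the averaged_perceptron_tagger NLTK data is installed:

import nltk

def first_tag_after_span(question, span_end_marker='</answer>'):
    # Hypothetical stand-in: the real project locates the answer span differently.
    tokens = question.split()
    if span_end_marker not in tokens:
        return None
    rest = tokens[tokens.index(span_end_marker) + 1:]
    return nltk.pos_tag(rest)[0][1] if rest else None

print(first_tag_after_span('In which year did <answer> the war </answer> finally end ?'))
# e.g. 'RB' for "finally" (the exact tag depends on the tagger model)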
def get_semtypes(self, node):
    """
    Based on the node, return a list of plausible semtypes in order of
    plausibility.
    """
    rel = node['rel'].lower()
    word = node['word'].lower()

    if rel == 'spec':
        if word in SPEC_SEMTYPES:
            return [SPEC_SEMTYPES[word]]
        else:
            return [SPEC_SEMTYPES['default']]
    elif rel in ['nmod', 'vmod']:
        return [node['tag'], rel]
    else:
        return [node['tag']]
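A standalone sketch of the same selection logic, using a hypothetical dependency-graph node and a trimmed-down SPEC_SEMTYPES table (the full mapping lives in nltk.sem.glue; the entries below are illustrative only):

# Illustrative subset of SPEC_SEMTYPES; the real mapping in nltk.sem.glue is larger.
SPEC_SEMTYPES = {'a': 'ex_quant', 'every': 'univ_quant', 'default': 'ex_quant'}

def get_semtypes(node):
    rel = node['rel'].lower()
    word = node['word'].lower()
    if rel == 'spec':
        return [SPEC_SEMTYPES.get(word, SPEC_SEMTYPES['default'])]
    elif rel in ('nmod', 'vmod'):
        return [node['tag'], rel]
    return [node['tag']]

# Node dicts roughly follow NLTK's DependencyGraph node format.
print(get_semtypes({'word': 'every', 'tag': 'DT', 'rel': 'spec'}))   # ['univ_quant']
print(get_semtypes({'word': 'dog', 'tag': 'NN', 'rel': 'nsubj'}))    # ['NN']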
def extract_JK(pos_seq):
    """The 'JK' method in Handler et al. 2016.
    Returns token positions of valid ngrams."""
    def find_ngrams(input_list, num_):
        '''get ngrams of len n from input list'''
        return zip(*[input_list[i:] for i in range(num_)])

    # copied from M and S chp 5
    patterns = set(['AN', 'NN', 'AAN', 'ANN', 'NAN', 'NNN', 'NPN'])
    pos_seq = [tag2coarse.get(tag, 'O') for tag in pos_seq]
    pos_seq = [(i, p) for i, p in enumerate(pos_seq)]
    ngrams = [ngram for n in range(1, 4) for ngram in find_ngrams(pos_seq, n)]

    def stringify(s):
        return "".join(a[1] for a in s)

    def positionify(s):
        return tuple(a[0] for a in s)

    ngrams = filter(lambda x: stringify(x) in patterns, ngrams)
    return [set(positionify(n)) for n in ngrams]
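A quick usage sketch for extract_JK, assuming the function above is defined in the same session so it picks up the tag2coarse defined here. The coarse mapping below is illustrative, in the spirit of phrasemachine's real (larger) tag2coarse table:

# Illustrative coarse mapping: N = noun, A = adjective, P = preposition, everything else = O.
tag2coarse = {'NN': 'N', 'NNS': 'N', 'NNP': 'N', 'NNPS': 'N',
              'JJ': 'A', 'JJR': 'A', 'JJS': 'A', 'IN': 'P'}

pos_tags = ['DT', 'JJ', 'NN', 'NN', 'IN', 'NNS']   # e.g. "the big data set of tweets"
print(extract_JK(pos_tags))
# [{1, 2}, {2, 3}, {1, 2, 3}, {3, 4, 5}] -- position sets of pattern-matching ngrams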
########
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer

    tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
    tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))

    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)

    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)
# http://www.nltk.org/book/ch05.html
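For comparison, a minimal sketch of the same tokenizer-plus-tagger setup using NLTK's own distributed model instead of phrasemachine's bundled pickles (assumes the averaged_perceptron_tagger data has been fetched with nltk.download):

from nltk.tag.perceptron import PerceptronTagger
from nltk.tokenize import TreebankWordTokenizer

tagger = PerceptronTagger()                 # loads NLTK's default averaged perceptron model
tokenize = TreebankWordTokenizer().tokenize

tokens = tokenize("Embarrassingly parallel data analysis with AWS Lambda.")
print(tagger.tag(tokens))
# e.g. [('Embarrassingly', 'RB'), ('parallel', 'JJ'), ...] -- exact tags depend on the model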
def get_tweet_tags(tweet):
    """ Break up a tweet into individual word parts """
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(tweet)
    # replace handles with real names
    for n, tok in enumerate(tokens):
        if tok.startswith('@'):
            handle = tok.strip("@")
            if handle in user.students:
                # If we have a database entry for the mentioned user, we can
                # easily substitute a full name.
                usr = user.NPUser(handle)
                tokens[n] = usr.fullname
            else:
                # If there is no database entry, we use the user's alias. While
                # this is the full name in many cases, it is often not reliable.
                usr = api.get_user(handle)
                tokens[n] = usr.name
    tagged = nltk.pos_tag(tokens)
    # In nltk, if a teacher's name is written with a period after an
    # abbreviated prefix, it is awkwardly broken up into 3 tags
    for n, tag in enumerate(tagged):
        # If there is the weird period after the prefix,
        if tag[1] == '.':
            # and it is in fact splitting up a person's name,
            if tagged[n - 1][1] == 'NNP' and tagged[n + 1][1] == 'NNP':
                if tagged[n - 1][0] in ['Mr', 'Ms', 'Mrs', 'Mx']:
                    # combine it into the actual name,
                    tagged[n - 1] = ('{}. {}'.format(tagged[n - 1][0],
                                                     tagged[n + 1][0]), 'NNP')
                    # and then remove the extra tags.
                    del tagged[n + 1]
                    del tagged[n]
    return tagged
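Stripped of the project-specific user database and Twitter api handle resolution, the tokenize-then-tag core of the function above looks like this (again assuming the standard NLTK tagger data is installed):

import nltk
from nltk.tokenize import TweetTokenizer

tweet = "@ms_smith said the homework is due Friday #sad"
tokens = TweetTokenizer().tokenize(tweet)   # keeps @handles and #hashtags as single tokens
print(nltk.pos_tag(tokens))
# e.g. [('@ms_smith', 'NN'), ('said', 'VBD'), ('the', 'DT'), ...] -- tags depend on the model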
def load_xml(self, xmldir):
    '''
    for KDD/WWW/UMD only
    :return: doclist
    '''
    for filename in os.listdir(xmldir):
        with open(xmldir + filename) as textfile:
            doc = Document()
            doc.name = filename[:filename.find('.xml')]

            import string
            printable = set(string.printable)
            # print((filename))
            try:
                lines = textfile.readlines()
                # filter() returns an iterator in Python 3, so join each filtered
                # line back into a string before concatenating
                xml = ''.join(''.join(filter(lambda x: x in printable, l)) for l in lines)
                root = ET.fromstring(xml)

                doc.title = root.findall("title")[0].text
                doc.abstract = root.findall("abstract")[0].text
                doc.phrases = [n.text for n in root.findall("*/tag")]

                self.doclist.append(doc)
            except UnicodeDecodeError:
                print('UnicodeDecodeError detected! %s' % filename)
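The XML shape load_xml expects can be read off its findall calls: a root element with <title>, <abstract>, and keyphrase <tag> elements nested one level down. A self-contained ElementTree sketch with a hypothetical document:

import xml.etree.ElementTree as ET

xml = """
<document>
  <title>Keyphrase extraction from scholarly articles</title>
  <abstract>We study automatic keyphrase extraction ...</abstract>
  <keyphrases>
    <tag>keyphrase extraction</tag>
    <tag>scholarly articles</tag>
  </keyphrases>
</document>
"""

root = ET.fromstring(xml)
print(root.findall("title")[0].text)            # Keyphrase extraction from scholarly articles
print([n.text for n in root.findall("*/tag")])  # ['keyphrase extraction', 'scholarly articles']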
def get_postag_with_record(records, pairs):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx, (record, pair) in enumerate(zip(records, pairs)):  # len(test_data_plain)
        print('*' * 100)
        print('File: ' + record['name'])
        print('Input: ' + str(pair[0]))
        text = pos_tagger.tag(pair[0])
        print('[%d/%d][%d] : %s' % (idx, len(records), len(pair[0]), str(text)))
        tagged_source.append(text)

    return tagged_source
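get_postag_with_record (and get_postag_with_index below) are thin wrappers around NLTK's StanfordPOSTagger, which shells out to the Java tagger. A minimal sketch of that wrapper on its own; the paths are placeholders for a local Stanford POS tagger download, and a Java runtime must be on the PATH:

from nltk.tag import StanfordPOSTagger

jar = '/path/to/stanford-postagger/stanford-postagger.jar'                          # placeholder
model = '/path/to/stanford-postagger/models/english-bidirectional-distsim.tagger'  # placeholder

st = StanfordPOSTagger(model, jar)
print(st.tag('What is the airspeed of an unladen swallow ?'.split()))
# e.g. [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ...]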
def get_postag_with_index(sources, idx2word, word2idx):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx in range(len(sources)):  # xrange in the original (Python 2); len(test_data_plain)
        test_s_o = sources[idx]
        source_text = keyphrase_utils.cut_zero(test_s_o, idx2word)

        text = pos_tagger.tag(source_text)
        print('[%d/%d] : %s' % (idx, len(sources), str(text)))

        tagged_source.append(text)

    return tagged_source
Source: chunkers.py — project: Natural-Language-Processing-Python-and-NLTK (author: PacktPublishing)
def conll_tag_chunks(chunk_sents):
    '''Convert each chunked sentence to a list of (tag, chunk_tag) tuples,
    so the final result is a list of lists of (tag, chunk_tag) tuples.
    >>> from nltk.tree import Tree
    >>> t = Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')])])
    >>> conll_tag_chunks([t])
    [[('DT', 'B-NP'), ('NN', 'I-NP')]]
    '''
    tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]
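The (tag, chunk_tag) pairs produced above are what the cookbook's TagChunker trains sequential taggers on: the POS tag plays the role of the "word" and the IOB chunk tag is the label. A sketch of that training step, assuming conll_tag_chunks above is defined in the same session and the conll2000 corpus has been downloaded:

from nltk.corpus import conll2000
from nltk.chunk.util import tree2conlltags   # needed by conll_tag_chunks above
from nltk.tag import UnigramTagger, BigramTagger

train_chunks = conll2000.chunked_sents('train.txt')
train_data = conll_tag_chunks(train_chunks)        # one [(pos, iob), ...] list per sentence

u_tagger = UnigramTagger(train_data)
ub_tagger = BigramTagger(train_data, backoff=u_tagger)

print(ub_tagger.tag(['DT', 'JJ', 'NN', 'VBD', 'IN', 'DT', 'NN']))
# e.g. [('DT', 'B-NP'), ('JJ', 'I-NP'), ('NN', 'I-NP'), ('VBD', 'B-VP'), ...]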
def parse(self, tagged_sent):
    '''Parse tagged tokens into a parse Tree of chunks.'''
    if not tagged_sent:
        return None
    (words, tags) = zip(*tagged_sent)
    chunks = self.tagger.tag(tags)
    # create conll str for tree parsing
    wtc = zip(words, chunks)
    return conlltags2tree([(w, t, c) for (w, (t, c)) in wtc])
def parse(self, tagged_sent):
    if not tagged_sent:
        return None
    chunks = self.tagger.tag(tagged_sent)
    return conlltags2tree([(w, t, c) for ((w, t), c) in chunks])
def parse(self, tagged_sent):
    iobs = []
    in_person = False

    for word, tag in tagged_sent:
        if word in self.name_set and in_person:
            iobs.append((word, tag, 'I-PERSON'))
        elif word in self.name_set:
            iobs.append((word, tag, 'B-PERSON'))
            in_person = True
        else:
            iobs.append((word, tag, 'O'))
            in_person = False

    return conlltags2tree(iobs)
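The name_set this method checks against is typically built from NLTK's names gazetteer (requires nltk.download('names')). A simplified, self-contained sketch of that lookup; unlike the full parse() above it does not track in_person, so consecutive name tokens would each start their own chunk:

import nltk
from nltk.corpus import names
from nltk.chunk import conlltags2tree

name_set = set(names.words())                          # male.txt + female.txt first names
tagged = nltk.pos_tag('Alice met Bob at the station .'.split())

iob = [(w, t, 'B-PERSON' if w in name_set else 'O') for w, t in tagged]
print(conlltags2tree(iob))
# e.g. (S (PERSON Alice/NNP) met/VBD (PERSON Bob/NNP) at/IN the/DT station/NN ./.)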
def iob_locations(self, tagged_sent):
    i = 0
    l = len(tagged_sent)
    inside = False

    while i < l:
        word, tag = tagged_sent[i]
        j = i + 1
        k = j + self.lookahead
        nextwords, nexttags = [], []
        loc = False

        # lookahead in the sentence to find multi-word locations
        while j < k:
            if ' '.join([word] + nextwords) in self.locations:
                # combine multiple separate locations into single location chunk
                if inside:
                    yield word, tag, 'I-LOCATION'
                else:
                    yield word, tag, 'B-LOCATION'

                # every next word is inside the location chunk
                for nword, ntag in zip(nextwords, nexttags):
                    yield nword, ntag, 'I-LOCATION'

                # found a location, so we're inside a chunk
                loc, inside = True, True
                # move forward to the next word since the current words
                # are already chunked
                i = j
                break

            if j < l:
                nextword, nexttag = tagged_sent[j]
                nextwords.append(nextword)
                nexttags.append(nexttag)
                j += 1
            else:
                break

        # if no location found, then we're outside the location chunk
        if not loc:
            inside = False
            i += 1
            yield word, tag, 'O'
def ieer_chunked_sents(tag=nltk.tag.pos_tag):
    for doc in ieer.parsed_docs():
        tagged = ieertree2conlltags(doc.text, tag)
        yield conlltags2tree(tagged)
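A usage sketch for the generator above; it assumes the ieer corpus is installed (nltk.download('ieer')) and that ieertree2conlltags, defined elsewhere in the same cookbook module, is in scope:

from itertools import islice

for tree in islice(ieer_chunked_sents(), 2):
    print(tree.label(), len(tree))   # 'S' plus the number of top-level tokens/chunks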
def _get_wordnet_pos(spacy_token):
    '''Wordnet POS tag'''
    pos = spacy_token.tag_[0].lower()
    if pos in ['a', 'n', 'v']:
        return pos