def _synonym_prefilter_fn(token, synonym):
'''
Heuristics for rejecting unusable synonym candidates: multi-word phrases
(more than two words), candidates that share the token's lemma, candidates
whose POS tag differs from the token's, and any substitution when the
original token is 'be'.
'''
if (len(synonym.text.split()) > 2) or \
(synonym.lemma == token.lemma) or \
(synonym.tag != token.tag) or \
(token.text.lower() == 'be'):
return False
else:
return True
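The filter assumes spaCy-style token objects exposing .text, .lemma, and .tag. A minimal sketch with stand-in namedtuples (illustrative only, not the original caller):

from collections import namedtuple

# Stand-in for spaCy tokens: only the attributes the filter actually reads.
Tok = namedtuple('Tok', ['text', 'lemma', 'tag'])

token = Tok(text='quick', lemma='quick', tag='JJ')
candidates = [
    Tok(text='speedy', lemma='speedy', tag='JJ'),               # kept
    Tok(text='quick', lemma='quick', tag='JJ'),                 # same lemma -> rejected
    Tok(text='fast', lemma='fast', tag='RB'),                   # different tag -> rejected
    Tok(text='as fast as lightning', lemma='fast', tag='JJ'),   # multi-word -> rejected
]
kept = [c for c in candidates if _synonym_prefilter_fn(token, c)]
print([c.text for c in kept])   # ['speedy']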
Python tag() examples (source code)
def _join(lst, sep=' ', untag=False):
"""
Join a list into a string, turning (word, tag) tuples into 'word/tag' strings or just the words.
:param untag: if ``True``, omit the tag from tagged input strings.
:type lst: list
:rtype: str
"""
try:
return sep.join(lst)
except TypeError:
if untag:
return sep.join(tup[0] for tup in lst)
from nltk.tag import tuple2str
return sep.join(tuple2str(tup) for tup in lst)
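A short usage sketch (assuming NLTK is installed, since the tagged-tuple path imports tuple2str):

print(_join(['the', 'cat', 'sat']))                       # 'the cat sat'
print(_join([('the', 'DT'), ('cat', 'NN')]))              # 'the/DT cat/NN'
print(_join([('the', 'DT'), ('cat', 'NN')], untag=True))  # 'the cat'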
def map_words(self, _text):
    """Group the unique words of ``_text`` by their POS tag."""
    mapping = defaultdict(list)
    # Tagging the *set* of words assigns tags out of sentence context,
    # which is a rough but cheap way to build a tag -> words index.
    tagged_words = pos_tag(set(self.get_words(_text)))
    for word, tag in tagged_words:
        mapping[tag].append(word)
    return mapping
def lookup(self, node, depgraph, counter):
semtype_names = self.get_semtypes(node)
semtype = None
for name in semtype_names:
if name in self:
semtype = self[name]
break
if semtype is None:
# raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
return []
self.add_missing_dependencies(node, depgraph)
lookup = self._lookup_semtype_option(semtype, node, depgraph)
if not len(lookup):
raise KeyError(
"There is no GlueDict entry for sem type of '%s' "
"with tag '%s', and rel '%s'" %
(node['word'], node['tag'], node['rel'])
)
return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter)
rer_build_history.py (project: Intelligent-Phone-Salesman, author: ShruthiChari)
def postagger(sent):
text = nltk.word_tokenize(sent)
posTagged = pos_tag(text)
#simplifiedTags = [map_tag('en-ptb', 'universal', tag) for word, tag in posTagged]
return posTagged
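Example call (requires NLTK's 'punkt' and 'averaged_perceptron_tagger' data; the exact tags depend on the tagger model):

print(postagger("Jerusalem bells are ringing"))
# e.g. [('Jerusalem', 'NNP'), ('bells', 'NNS'), ('are', 'VBP'), ('ringing', 'VBG')]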
def tag(text):
    """
    Takes a ``list`` of tokens and returns a ``list`` of the form [('word', 'tag')].
    """
    tagger = nltk.tag.UnigramTagger(model=data())  # backoff=default_tagger)
    return tagger.tag(text)
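The data() helper is not shown here; UnigramTagger(model=...) expects a dict mapping each word form to its tag. A toy sketch with a hand-built model (purely illustrative, not the project's real data):

import nltk

# Hypothetical stand-in for data(): a plain word -> tag lookup table.
toy_model = {'the': 'DT', 'cat': 'NN', 'sat': 'VBD'}
tagger = nltk.tag.UnigramTagger(model=toy_model)
print(tagger.tag(['the', 'cat', 'sat', 'down']))
# [('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD'), ('down', None)]  -- unknown words get None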
def logmsg(s):
    # would be better to use the python logging module; assumes `import sys` at module level
    print("[phrasemachine] %s" % s, file=sys.stderr)
############## SimpleNP
## Uses a five-tag coarse grammar.
## tagset: A D P N O
# Requires conversion from PTB or Petrov/Gimpel tags to our system.
# "Coarse*" indicates petrov/gimpel
# Grammar change from the FST version: can't repeat NUM in both adj and noun.
def coarse_tag_str(pos_seq):
"""Convert POS sequence to our coarse system, formatted as a string."""
global tag2coarse
tags = [tag2coarse.get(tag,'O') for tag in pos_seq]
return ''.join(tags)
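tag2coarse is defined elsewhere in the module; a minimal sketch of what such a PTB-to-coarse mapping could look like (assumed values, not the module's actual table):

# Illustrative subset of a PTB -> {A, D, P, N, O} mapping:
# adjectives, determiners, prepositions, nouns, other.
tag2coarse = {
    'JJ': 'A', 'JJR': 'A', 'JJS': 'A',
    'DT': 'D',
    'IN': 'P',
    'NN': 'N', 'NNS': 'N', 'NNP': 'N', 'NNPS': 'N',
}

print(coarse_tag_str(['DT', 'JJ', 'NN', 'VBD']))   # 'DANO' -- unmapped tags fall back to 'O'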
# POS extraction assuming list of POS tags as input.
# >>> pyre.extract_finditer(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 4)]
# >>> pyre.extract_ngram_filter(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]
def tag_text(self, text):
'''take input text and return tokens w/ part of speech tags using NLTK'''
# import here instead of at the top of the file because not everyone will have nltk installed
sents = self.sent_detector.tokenize(text)  # TODO: this can fail on some unicode chars; it seems to assume ASCII
word_pos_pairs = []
all_tokens = []
for sent in sents:
tokens = self.tokenize(sent)
all_tokens = all_tokens + tokens
word_pos_pairs = word_pos_pairs + self.tagger.tag(tokens)
return {'tokens': all_tokens, 'pos': [tag for (w,tag) in word_pos_pairs]}
feature_construction.py (project: Automatic-Question-Generation, author: bwanglzu)
def _ner_features(self, row):
"""Name entity recognition features
- Args:
row(pandas.dataframe): dataframe of current row
- Returns:
row(pandas.dataframe): result a pandas dataframe with new feature
"""
answer = row.Answer
question = row.Question
if answer is not None and question is not None:
sentence_len = len(row.Sentence.split())
ners_answer = self.st.tag(answer.split())
ners_question = self.st.tag(question.split())
ner_values_answer = [v for k, v in ners_answer if v in [
'PERSON', 'ORGANIZATION', 'LOCATION']]
ner_values_question = [v for k, v in ners_question if v in [
'PERSON', 'ORGANIZATION', 'LOCATION']]
else:
return None
# NER IN ANSWER
if 'PERSON' in ner_values_answer:
row['NAMED_ENTITY_IN_ANSWER_COUNT_PERS'] = 1
else:
row['NAMED_ENTITY_IN_ANSWER_COUNT_PERS'] = 0
if 'ORGANIZATION' in ner_values_answer:
row['NAMED_ENTITY_IN_ANSWER_COUNT_ORG'] = 1
else:
row['NAMED_ENTITY_IN_ANSWER_COUNT_ORG'] = 0
if 'LOCATION' in ner_values_answer:
row['NAMED_ENTITY_IN_ANSWER_COUNT_LOC'] = 1
else:
row['NAMED_ENTITY_IN_ANSWER_COUNT_LOC'] = 0
# NER IN QUESTION
if 'PERSON' in ner_values_question:
row['NAMED_ENTITY_OUT_ANSWER_COUNT_PERS'] = 1
else:
row['NAMED_ENTITY_OUT_ANSWER_COUNT_PERS'] = 0
if 'ORGANIZATION' in ner_values_question:
row['NAMED_ENTITY_OUT_ANSWER_COUNT_ORG'] = 1
else:
row['NAMED_ENTITY_OUT_ANSWER_COUNT_ORG'] = 0
if 'LOCATION' in ner_values_question:
row['NAMED_ENTITY_OUT_ANSWER_COUNT_LOC'] = 1
else:
row['NAMED_ENTITY_OUT_ANSWER_COUNT_LOC'] = 0
row['NUM_NAMED_ENTITIES_IN_ANSWER'] = len(ner_values_answer)
row['NUM_NAMED_ENTITIES_OUT_ANSWER'] = len(ner_values_question)
row['ANSWER_NAMED_ENTITY_DENSITY'] = float(
len(ner_values_answer)) / sentence_len
row['QUESTION_NAMED_ENTITY_DENSITY'] = float(
len(ner_values_question)) / sentence_len
return row
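A rough sketch of exercising the feature builder with a stub in place of the Stanford NER tagger that the class normally wires up as self.st (everything below is illustrative):

import pandas as pd
from types import SimpleNamespace

class FakeNER:
    """Stub standing in for the Stanford NER tagger (illustrative only)."""
    PEOPLE = {'Barack', 'Obama'}
    def tag(self, tokens):
        return [(t, 'PERSON' if t in self.PEOPLE else 'O') for t in tokens]

fake_self = SimpleNamespace(st=FakeNER())
row = pd.Series({
    'Answer': 'Barack Obama',
    'Question': 'Who was president in 2009 ?',
    'Sentence': 'Barack Obama was president in 2009 .',
})
row = _ner_features(fake_self, row)   # call the function directly with the stub as `self`
print(row['NAMED_ENTITY_IN_ANSWER_COUNT_PERS'], row['NUM_NAMED_ENTITIES_IN_ANSWER'])
# 1 2  (two PERSON tokens found in the answer)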
def check_postag(config):
train_set, validation_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])
path = os.path.dirname(__file__)
path = path[:path.rfind(os.sep, 0, len(path)-10)+1] + 'stanford-postagger/'
jar = path + '/stanford-postagger.jar'
model = path + '/models/english-bidirectional-distsim.tagger'
pos_tagger = StanfordPOSTagger(model, jar)
for dataset_name in config['testing_datasets']:
# override the original test_set
# test_set = load_testing_data(dataset_name, kwargs=dict(basedir=config['path']))(idx2word, word2idx, config['preprocess_type'])
test_sets = load_additional_testing_data(config['testing_datasets'], idx2word, word2idx, config)
test_set = test_sets[dataset_name]
# print(dataset_name)
# print('Avg length=%d, Max length=%d' % (np.average([len(s) for s in test_set['source']]), np.max([len(s) for s in test_set['source']])))
test_data_plain = list(zip(test_set['source'], test_set['target']))  # materialize so len() and indexing work on Python 3
test_size = len(test_data_plain)
# Alternatively to setting the CLASSPATH add the jar and model via their path:
jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
# model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'
pos_tagger = StanfordPOSTagger(model, jar)
for idx in range(len(test_data_plain)):
test_s_o, test_t_o = test_data_plain[idx]
source = keyphrase_utils.cut_zero(test_s_o, idx2word)
print(source)
# Add other jars from Stanford directory
stanford_dir = jar.rpartition('/')[0]
stanford_jars = find_jars_within_path(stanford_dir)
pos_tagger._stanford_jar = ':'.join(stanford_jars)
text = pos_tagger.tag(source)
print(text)
rer_build_history.py (project: Intelligent-Phone-Salesman, author: ShruthiChari)
def build_history(data_list, supported_tags_phones,supported_tags):
history_list = [] # list of all histories
sents = []
count = 0
expected = []
for data in data_list: # data is the inputs entered by a given student
data1 = data['data']
#data1 is for every sentence entered by user
for rec in data1:
updates = rec['updates']
sent = rec['sentence']
relatedTags=[]
relations=[]
if "rels" in rec.keys():
relatedEntities = rec['rels']
expected.append(relatedEntities)
for i in relatedEntities:
    relations.append(list(i.keys()))
    for j in i[list(i.keys())[0]]:
        relatedTags.append(j)
words = []
posTaggedSent = postagger(sent)
#chunkPhrases = chunker(sent)
if len(updates) == len(posTaggedSent):
for i in range(len(updates)):
words.append({"word":updates[i]['word'],"pos":posTaggedSent[i],"tag":updates[i]['tag']})
#------------------------------------------------------------------------------------------------
# NOTE: the code below is a temporary hack to build the MaxEnt model for just 2 tags - we will change this later
if (updates[i]['tag'] not in supported_tags_phones):
if updates[i]['tag'] == "Model":
updates[i]['tag'] = "Version"
else:
updates[i]['tag'] = "Other"
#------------------------------------------------------------------------------------------------
sents.append(words)
history={}
history['sentence'] = words
history['i'] = count+1
#history['phrases'] = chunkPhrases
history['relatedTags'] = relatedTags
if len(relations) > 0:
history_list.append((history,relations[0][0],))
else:
history_list.append((history,"None",))
count += 1
return (history_list,sents,expected)
rer_build_history.py (project: Intelligent-Phone-Salesman, author: ShruthiChari)
def chunker(sent):
#a = [("I","PRP"),("hear","VBP"),("Jerusalem","NNP"),("bells","NNS"),("ringing","VBG")]
#input_sent = " Rockwell said the agreement calls for it to supply 200 additional so-called shipsets for the planes."
input_sent = sent
text = nltk.word_tokenize(input_sent)
a = nltk.pos_tag(text)
phrases = []
tup = ()
'''test_sents = conll2000.chunked_sents('test.txt', chunk_types=['VP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['VP'])
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])'''
NP_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
VP_sents = conll2000.chunked_sents('train.txt', chunk_types=['VP'])
class ChunkParser(nltk.ChunkParserI):
def __init__(self, train_sents):
train_data = [[(t,c) for w,t,c in nltk.chunk.tree2conlltags(sent)] for sent in train_sents]
self.tagger = nltk.TrigramTagger(train_data)
def parse(self, sentence):
pos_tags = [pos for (word,pos) in sentence]
tagged_pos_tags = self.tagger.tag(pos_tags)
chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
conlltags = [(word, pos, chunktag) for ((word,pos),chunktag) in zip(sentence, chunktags)]
return nltk.chunk.util.conlltags2tree(conlltags)
NPChunker = ChunkParser(NP_sents)
VPChunker = ChunkParser(VP_sents)
#print (NPChunker.parse("I hear Jerusalem bells ringing"))
parsed_sent = NPChunker.parse(a)
for i in parsed_sent:
if (type(i)!=type(tup)):
l=[]
for t in tuple(i):
l.append(t[0])
phrases.append({"NP":" ".join(l)})
parsed_sent = VPChunker.parse(a)
for i in parsed_sent:
if (type(i)!=type(tup)):
l=[]
for t in tuple(i):
l.append(t[0])
phrases.append({"VP":" ".join(l)})
return phrases
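Usage is a single call per sentence (assuming the NLTK 'conll2000', 'punkt', and 'averaged_perceptron_tagger' data packages are available). Note that both chunkers are retrained from conll2000 on every call, so in practice you would train them once and reuse them:

phrases = chunker("Rockwell said the agreement calls for it to supply 200 additional shipsets.")
print(phrases)
# e.g. [{'NP': 'Rockwell'}, {'NP': 'the agreement'}, {'NP': '200 additional shipsets'}, {'VP': 'said'}, ...]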