def get_svm_train_line(tree, pair, sid):
    """Build one tree-kernel SVM training line from a parse-tree string.

    The text of the first token of each of the two entities in ``pair`` is
    replaced by the placeholder ``candidatedrug`` wherever it occurs in
    ``tree``; the masked tree is then emitted between the ``|BT|`` and
    ``|ET|`` markers.

    Args:
        tree: Bracketed parse tree as a string. A string that does not start
            with ``(`` is wrapped into a minimal ``(S (... NN))`` bracketing.
        pair: Object exposing ``entities[0]`` and ``entities[1]``, each with
            ``tokens[0].text``.
        sid: Sentence identifier. Currently unused; kept so existing callers
            keep working.
            TODO: use it to replace the sentence's other entities with an
            ``otherdrug`` placeholder.

    Returns:
        The string ``'1 |BT|' + masked_tree + '|ET| \\n'``. The label is
        always ``1`` and no feature vector is appended yet.

    Raises:
        IndexError: If the tree is empty after masking.
        Exception: Whatever ``Tree.fromstring`` raises for a malformed
            bracketing (presumably ``ValueError`` -- confirm against the
            installed NLTK version).
    """
    placeholder = 'candidatedrug'
    mentions = (
        pair.entities[0].tokens[0].text,
        pair.entities[1].tokens[0].text,
    )

    # Mask both mentions without ever re-scanning an inserted placeholder:
    # two chained str.replace() calls would let the second mention match
    # inside 'candidatedrug' itself (e.g. a mention "drug"). Keeping the
    # untouched text as segments and joining once at the end avoids that.
    segments = [tree]
    for mention in mentions:
        if not mention:
            # str.split rejects an empty separator, and replacing '' would
            # scatter the placeholder between every character.
            continue
        pieces = []
        for segment in segments:
            pieces.extend(segment.split(mention))
        segments = pieces
    tree = placeholder.join(segments)

    # A bare token string is wrapped so that it forms a bracketed tree.
    if tree[0] != '(':
        tree = '(S (' + tree + ' NN))'

    # Parsed only to fail fast on a malformed bracketing; the parsed tree is
    # not needed for the output. The method name depends on the NLTK version
    # (older releases spelled it Tree.parse).
    Tree.fromstring(tree)

    logging.debug("tree:%s", tree)
    return '1 |BT|' + tree + '|ET| \n'
评论列表
文章目录