def init_estimator(self):
    # Draw a weighted bootstrap sample of the training data.
    indices = np.random.choice(self.X.shape[0], self.n_samples, p=self.weights)
    X_tree = self.X[indices, :]
    y_tree = self.y[indices]
    print("%s / %s" % (self.count, self.n_estimators))
    while True:
        t1 = time.time()
        tree = Tree(X_tree, y_tree)
        t2 = time.time()
        print("tree generation time: %s" % (t2 - t1))
        predictions = tree.predict(self.X)
        accuracy = accuracy_score(self.y, predictions)
        print("accuracy: %s" % accuracy)
        # Reject trees that are no better than a coin flip.
        if accuracy != 0.50:
            self.estimators.append(tree)
            break
    return tree, predictions
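# For context: np.random.choice with p= draws indices with replacement
# according to the given weights, which is what makes this a weighted
# bootstrap. A minimal, self-contained sketch (the array names below are
# illustrative, not from the class above):
import numpy as np

X_demo = np.arange(12).reshape(6, 2)
weights_demo = np.array([0.05, 0.05, 0.1, 0.1, 0.35, 0.35])  # must sum to 1
indices_demo = np.random.choice(X_demo.shape[0], size=6, p=weights_demo)
X_boot = X_demo[indices_demo, :]   # high-weight rows appear more often
print(indices_demo, X_boot.shape)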
def _get_dependencies(self, tree, sent_len):
    def rec(subtree):
        if isinstance(subtree, Tree):
            children = subtree.children
            if len(children) == 2:
                head = rec(children[0 if subtree.left_is_head else 1])
                dep = rec(children[1 if subtree.left_is_head else 0])
                res[dep] = head
            else:
                head = rec(children[0])
            return head
        else:
            return subtree.pos
    res = [-1 for _ in range(sent_len)]
    rec(tree)
    res = [i + 1 for i in res]  # shift to 1-based indices; the root keeps head 0
    assert sum(1 for i in res if i == 0) == 1  # exactly one root
    return res
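# A self-contained illustration of the same head-percolation idea; the
# Node/Leaf classes below are stand-ins for the original Tree, and the
# sketch only handles binary nodes, which is all this example needs.
class Leaf:
    def __init__(self, pos):
        self.pos = pos  # 0-based token position

class Node:
    def __init__(self, children, left_is_head):
        self.children = children
        self.left_is_head = left_is_head

def heads(tree, sent_len):
    res = [-1] * sent_len
    def rec(t):
        if isinstance(t, Node):
            h = rec(t.children[0 if t.left_is_head else 1])
            d = rec(t.children[1 if t.left_is_head else 0])
            res[d] = h  # the non-head child depends on the head child
            return h
        return t.pos
    rec(tree)
    return [i + 1 for i in res]  # 1-based heads, 0 marks the root

# "the cat sleeps": (the cat) attaches to sleeps, which heads the clause.
t = Node([Node([Leaf(0), Leaf(1)], left_is_head=False), Leaf(2)], left_is_head=False)
print(heads(t, 3))  # [2, 3, 0]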
def loadDataset(filename):
    """
    Load and return the dataset given in parameter
    """
    dataset = []
    with open(filename, 'r') as dataFile:
        # Extract rows
        for line in dataFile:
            dataset.append(tree.Tree(line))  # Create the tree for each sentence
    return dataset
def read_tree(self, line):
    # Each token i (1-based) lists its head in a whitespace-separated
    # parent-pointer string; 0 marks the root, -1 marks unattached tokens.
    parents = list(map(int, line.split()))
    trees = dict()
    root = None
    for i in range(1, len(parents) + 1):
        if i - 1 not in trees.keys() and parents[i - 1] != -1:
            idx = i
            prev = None
            while True:
                parent = parents[idx - 1]
                if parent == -1:
                    break
                tree = Tree()
                if prev is not None:
                    tree.add_child(prev)
                trees[idx - 1] = tree
                tree.idx = idx - 1
                if parent - 1 in trees.keys():
                    trees[parent - 1].add_child(tree)
                    break
                elif parent == 0:
                    root = tree
                    break
                else:
                    prev = tree
                    idx = parent
    return root
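# A quick, self-contained sketch of the parent-pointer format this reader
# expects: the i-th number (1-based) is token i's head, 0 marks the root.
# DemoTree and demo_read_tree are stand-ins, not the project's own code;
# the demo reproduces the algorithm above minus self.
class DemoTree:
    def __init__(self):
        self.idx = None
        self.children = []
    def add_child(self, child):
        self.children.append(child)

def demo_read_tree(line, TreeCls=DemoTree):
    parents = list(map(int, line.split()))
    trees, root = {}, None
    for i in range(1, len(parents) + 1):
        if i - 1 not in trees and parents[i - 1] != -1:
            idx, prev = i, None
            while True:
                parent = parents[idx - 1]
                if parent == -1:
                    break
                node = TreeCls()
                if prev is not None:
                    node.add_child(prev)
                trees[idx - 1] = node
                node.idx = idx - 1
                if parent - 1 in trees:
                    trees[parent - 1].add_child(node)
                    break
                elif parent == 0:
                    root = node
                    break
                else:
                    prev, idx = node, parent
    return root

# "3 3 0": tokens 1 and 2 attach to token 3, which heads the sentence.
root = demo_read_tree("3 3 0")
print(root.idx, sorted(c.idx for c in root.children))  # 2 [0, 1]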
def load(txt_fn):
    hrchy = tt.Tree()
    with open(txt_fn, 'rb') as fp:
        while True:
            node = readNode(fp)
            if not node:
                break
            if node['parent_id'] == 'root':
                hrchy.create_node({'desc': node['desc'], 'classes': node['classes']}, node['id'])
            else:
                hrchy.create_node({'desc': node['desc'], 'classes': node['classes']}, node['id'], node['parent_id'])
    return hrchy
def load_data(data_dir, order='top_down'):
    '''construct vocab and load data with a specified traversal order'''
    general_predicate_dir = os.path.join(data_dir, "general_predicate")
    with open(general_predicate_dir, 'r') as f:
        general_predicate = f.read().split('\n')
    word_vocab = Vocab()
    nt_vocab = Vocab()
    ter_vocab = Vocab()
    act_vocab = Vocab()
    word_tokens = collections.defaultdict(list)
    tree_tokens = collections.defaultdict(list)
    tran_actions = collections.defaultdict(list)
    for fname in ('train', 'valid', 'test'):
        print('reading', fname)
        pname = os.path.join(data_dir, fname)
        with codecs.open(pname, 'r', 'utf-8') as f:
            for line in f:
                # Each line holds a sentence and its s-expression parse, tab-separated.
                sen, sexp = line.rstrip().split('\t')
                sen = sen.split(' ')
                word_vocab.feed_all(sen)
                word_tokens[fname].append(sen)
                parse_tree = Tree()
                parse_tree.construct_from_sexp(sexp)
                nt, ter = parse_tree.get_nt_ter()
                nt_vocab.feed_all(nt)
                ter_vocab.feed_all(ter)
                tree_token, action = parse_tree.get_oracle(order, general_predicate)
                act_vocab.feed_all(action)
                tree_tokens[fname].append(tree_token)
                tran_actions[fname].append(action)
    return word_vocab, nt_vocab, ter_vocab, act_vocab, word_tokens, tree_tokens, tran_actions
def load_data(data_dir, order='pre_order'):
    '''construct vocab and load data with a specified traversal order'''
    word_vocab = Vocab()
    nt_vocab = Vocab()
    ter_vocab = Vocab()
    act_vocab = Vocab()
    act_vocab.feed_all(['NT', 'TER', 'ACT'])
    word_tokens = collections.defaultdict(list)
    tree_tokens = collections.defaultdict(list)
    tran_actions = collections.defaultdict(list)
    for fname in ('train', 'valid', 'test'):
        print('reading', fname)
        pname = os.path.join(data_dir, fname)
        with codecs.open(pname, 'r', 'utf-8') as f:
            for line in f:
                sen, sexp = line.rstrip().split('\t')
                sen = sen.split(' ')
                word_vocab.feed_all(sen)
                word_tokens[fname].append(sen)
                parse_tree = Tree()
                parse_tree.construct_from_sexp(sexp)
                nt, ter = parse_tree.get_nt_ter()
                nt_vocab.feed_all(nt)
                ter_vocab.feed_all(ter)
                traverse_method = getattr(parse_tree, order)
                tree_token, action = traverse_method(_ROOT)
                tree_tokens[fname].append(tree_token)
                tran_actions[fname].append(action)
    return word_vocab, nt_vocab, ter_vocab, act_vocab, word_tokens, tree_tokens, tran_actions
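# Both loaders assume each data-file line pairs a sentence with its
# s-expression parse, separated by a single tab. A hypothetical line,
# purely for illustration (the exact predicates depend on the dataset):
#
#   what states border texas\t(answer (state (next_to_2 (stateid texas))))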
def __init__(self, sent_id, parse_tree, dep_tree, words):
    self.leaves = []
    self.id = sent_id
    self.tree = Tree(parse_tree, sent_id)
    self.get_leaves()
    self.words = words
    self.begin_offset = words[0][1]['CharacterOffsetBegin']
    self.end_offset = words[-1][1]['CharacterOffsetEnd']
    self.word_ids = []
    self.true_connectives = []
    self.checked_connectives = []
    self.stem_leaf()
    self.depTree = DepTree(self, dep_tree)
    self.clauses = []
    self.break_clauses()
def __init__(self, sent_id, parse_tree, dep_tree, words):
    self.leaves = []
    self.id = sent_id
    self.tree = Tree(parse_tree, sent_id)
    self.get_leaves()
    self.words = words
    self.begin_offset = words[0][1]['CharacterOffsetBegin']
    self.end_offset = words[-1][1]['CharacterOffsetEnd']
    self.word_ids = []
    self.true_connectives = []
    self.checked_connectives = []
    self.depTree = DepTree(self, dep_tree)
    self.clauses = []
    self.break_clauses()
def build_forest(self):
    forest = {}
    for t in range(self.f_size):
        forest[t] = Tree(self, rho=self.rho)
        forest[t].tree_leaf_plots(fname='tree_opt%s.png' % t)
    path = os.getcwd() + '/plots/'
    mkdir_p(path)
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111)
    color = ['lightcoral', 'dodgerblue', 'mediumseagreen', 'darkorange']
    for t in range(self.f_size):
        for c, n in enumerate(forest[t].leaf_nodes):
            [[i1, i2], [j1, j2]] = n.quad
            x1, x2 = self.grid[0][i1], self.grid[0][i2]
            y1, y2 = self.grid[1][j1], self.grid[1][j2]
            # Cycle the palette so more than four leaves cannot raise IndexError.
            ax.fill_between([x1, x2], y1, y2, alpha=.15, color=color[c % len(color)])
    pd.DataFrame(self.data, columns=['x', 'y']).plot(ax=ax, x='x', y='y', kind='scatter', lw=0, alpha=.6, s=20, c='k')
    plt.savefig(path + 'combined.png', format='png')
    plt.close()
    return forest
# TODO: implement online L-curve optimization (e.g., an EWMA) so the input depth is no longer needed
def tune_entropy_threshold(self, n=5, depth=6, plot_debug=False):
    """
    Compute the mean optimal entropy based on the L-curve elbow method.
    """
    e_arr = []
    for i in range(n):
        var = Tree(self, rho=.5, depth=depth)
        e_arr += [pair + [i] for pair in var.entropy_gain_evol]
        var.domain_splits_plots(subpath='%s/' % i)
    entropy_evol = pd.DataFrame(e_arr, columns=['depth', 'entropy', 'tree'])
    entropy_evol = (entropy_evol.groupby(['tree', 'depth'])[['entropy']]
                    .mean()
                    .reset_index()
                    .pivot(columns='tree', index='depth', values='entropy')
                    .fillna(0))
    entropy_elbow_cand = entropy_evol.apply(lambda x: opt_L_curve(np.array(x.index), np.array(x)))
    avg_opt_entropy = entropy_elbow_cand.mean()
    if plot_debug:
        fig = plt.figure(figsize=(10, 10))
        ax = fig.add_subplot(111)
        entropy_evol.plot(ax=ax, kind='line', alpha=.6, lw=3.,
                          title='Avg. Opt. Entropy = %.2f' % avg_opt_entropy)
        plt.savefig('evol.png', format='png')
        plt.close()
    return avg_opt_entropy
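# opt_L_curve is defined elsewhere; a common elbow heuristic, sketched here
# as an assumption about what it does, picks the point farthest from the
# chord joining the curve's endpoints:
import numpy as np

def opt_L_curve_sketch(x, y):
    p1 = np.array([x[0], y[0]], dtype=float)
    p2 = np.array([x[-1], y[-1]], dtype=float)
    chord = (p2 - p1) / np.linalg.norm(p2 - p1)
    pts = np.stack([x, y], axis=1) - p1
    # Perpendicular distance to the chord via the 2-D cross product.
    dist = np.abs(pts[:, 0] * chord[1] - pts[:, 1] * chord[0])
    return x[np.argmax(dist)]

xs = np.array([0., 1., 2., 3., 4.])
ys = np.array([10., 4., 1., 0.8, 0.6])   # L-shaped curve, elbow near x = 2
print(opt_L_curve_sketch(xs, ys))        # 2.0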
def validate(self, tree, key, value):
    if isinstance(value, Tree):
        p = '.'.join(tree._path + [key])
        raise ValidationError(p, 'This key must be a value, not a tree.') from None
def read_tree(self, line):
    # Python 2 variant (map/xrange) of the parent-pointer reader above.
    parents = map(int, line.split())
    trees = dict()
    root = None
    for i in xrange(1, len(parents) + 1):
        if i - 1 not in trees.keys() and parents[i - 1] != -1:
            idx = i
            prev = None
            while True:
                parent = parents[idx - 1]
                if parent == -1:
                    break
                tree = Tree()
                if prev is not None:
                    tree.add_child(prev)
                trees[idx - 1] = tree
                tree.idx = idx - 1
                if parent - 1 in trees.keys():
                    trees[parent - 1].add_child(tree)
                    break
                elif parent == 0:
                    root = tree
                    break
                else:
                    prev = tree
                    idx = parent
    return root
def read_tree(self, line, label_line):
    # FIXED: tree.idx and the trees dict are 1-based, as in the dataset;
    # parents and labels remain 0-based lists, hence the idx-1 lookups.
    parents = map(int, line.split())  # one head index per token
    trees = dict()
    root = None
    labels = map(self.parse_dlabel_token, label_line.split())
    for i in xrange(1, len(parents) + 1):
        if i not in trees.keys() and parents[i - 1] != -1:
            idx = i
            prev = None
            while True:
                parent = parents[idx - 1]
                if parent == -1:
                    break
                tree = Tree()
                if prev is not None:
                    tree.add_child(prev)
                trees[idx] = tree
                # Keep idx 1-based so embs[tree.idx - 1] never sees tree.idx == 0.
                tree.idx = idx
                tree.gold_label = labels[idx - 1]  # node label
                if parent in trees.keys():
                    trees[parent].add_child(tree)
                    break
                elif parent == 0:
                    root = tree
                    break
                else:
                    prev = tree
                    idx = parent
    return root
def testCheckGradient():
    """
    Gradient checking by comparing the analytic gradient to a numerical approximation
    """
    # Create an arbitrary sample
    sample = tree.Tree("(4 (2 (2 But) (2 (3 (3 (2 believe) (2 it)) (2 or)) (1 not))) (4 (2 ,) (4 (2 it) (4 (4 (2 's) (4 (2 one) (4 (2 of) (4 (4 (2 the) (4 (4 (2 most) (4 (4 beautiful) (3 (2 ,) (3 evocative)))) (2 works))) (2 (2 I) (2 (2 've) (2 seen))))))) (2 .)))))")
    # sample.printTree()  # Check parsing and sample loading
    # Initialize the model
    model = rntnmodel.Model(
        randInitMaxValueNN=2.0,   # Try bigger values for the initial values
        # regularisationTerm=0    # Check without regularisation
        regularisationTerm=0.02   # Check gradient with regularisation
    )
    # Compute the gradient using the direct formula
    model.evaluateSample(sample)
    analyticGradient = model.backpropagate(sample)
    analyticGradient = model.addRegularisation(analyticGradient, 1)  # Don't forget to add the regularisation
    # Compute the gradient using the numerical approximation
    numericalGradient = computeNumericalGradient(sample, model)
    # Show results (detailed values); a single layer of dV is printed instead of the whole tensor
    print("Analytic dV[3]=\n", analyticGradient.dV[3])
    print("Numerical dV[3]=\n", numericalGradient.dV[3])
    print("Analytic dW=\n", analyticGradient.dW)
    print("Numerical dW=\n", numericalGradient.dW)
    print("Analytic db=\n", analyticGradient.db)
    print("Numerical db=\n", numericalGradient.db)
    print("Analytic dWs=\n", analyticGradient.dWs)
    print("Numerical dWs=\n", numericalGradient.dWs)
    print("Analytic dbs=\n", analyticGradient.dbs)
    print("Numerical dbs=\n", numericalGradient.dbs)
    # Show results (relative distances; values near zero mean the gradients agree)
    distV = np.linalg.norm(analyticGradient.dV - numericalGradient.dV) / np.linalg.norm(analyticGradient.dV + numericalGradient.dV)
    distW = np.linalg.norm(analyticGradient.dW - numericalGradient.dW) / np.linalg.norm(analyticGradient.dW + numericalGradient.dW)
    distb = np.linalg.norm(analyticGradient.db - numericalGradient.db) / np.linalg.norm(analyticGradient.db + numericalGradient.db)
    distWs = np.linalg.norm(analyticGradient.dWs - numericalGradient.dWs) / np.linalg.norm(analyticGradient.dWs + numericalGradient.dWs)
    distbs = np.linalg.norm(analyticGradient.dbs - numericalGradient.dbs) / np.linalg.norm(analyticGradient.dbs + numericalGradient.dbs)
    print("Distances: dV=", distV)
    print("Distances: dW=", distW)
    print("Distances: db=", distb)
    print("Distances: dWs=", distWs)
    print("Distances: dbs=", distbs)
def load_data(data_dir, order='top_down'):
    '''construct vocab and load data with a specified traversal order'''
    general_predicate_dir = os.path.join(data_dir, "general_nts")
    action_dir = os.path.join(data_dir, "actions")
    word_vocab = Vocab()
    nt_vocab = Vocab()
    ter_vocab = Vocab()
    act_vocab = Vocab()
    with open(general_predicate_dir, 'r') as f:
        general_predicate = f.read().split('\n')
    nt_vocab.feed_all(general_predicate)
    with open(action_dir, 'r') as f:
        actions = f.read().split('\n')
    act_vocab.feed_all(actions)
    word_tokens = collections.defaultdict(list)
    tree_tokens = collections.defaultdict(list)
    tran_actions = collections.defaultdict(list)
    for fname in ('train', 'valid', 'test'):
        print('reading', fname)
        pname = os.path.join(data_dir, fname)
        with codecs.open(pname, 'r', 'utf-8') as f:
            for line in f:
                sen, sexp = line.rstrip().split('\t')
                sen = sen.split(' ')
                word_vocab.feed_all(sen)
                word_tokens[fname].append(sen)
                parse_tree = Tree()
                parse_tree.construct_from_sexp(sexp)
                nt, ter = parse_tree.get_nt_ter()
                nt_vocab.feed_all(nt)
                ter_vocab.feed_all(ter)
                tree_token, action = parse_tree.get_oracle(order, general_predicate)
                tree_tokens[fname].append(tree_token)
                tran_actions[fname].append(action)
    return word_vocab, nt_vocab, ter_vocab, act_vocab, word_tokens, tree_tokens, tran_actions
def convert_graph(data_dir):
    _allowed_error = 0.000001
    rname = os.path.join(data_dir, 'train_lf_spade')
    with open(rname, 'w') as rf:
        for fname in ['spades.bow.graphs.train.json']:
            print('reading', fname)
            pname = os.path.join(data_dir, fname)
            with codecs.open(pname, 'r', 'utf-8') as f:
                for line in f:
                    try:
                        line = json.loads(line)
                    except ValueError:  # skip malformed JSON lines
                        continue
                    sen = line['words']
                    sen = [x['word'] for x in sen]
                    forest, answer = line['graphs'], line['answerString']
                    if 'entities' not in line:  # has_key() is Python 2 only
                        continue
                    entity_list = line['entities']
                    good_lf = []
                    bad_lf = []
                    if len(forest) == 0:
                        continue
                    find_lf = 0
                    for graph in forest:
                        lf = graph2lf(graph['graph'], entity_list)
                        if lf is None:
                            continue
                        parse_tree = Tree()
                        parse_tree.construct_from_sexp(lf)
                        find_lf = 1
                        nt, ter = parse_tree.get_nt_ter()
                        # A logical form is "good" if its denotation overlaps the answer.
                        if set(graph['denotation']) & set(answer):
                            good_lf.append((lf, graph['denotation']))
                        else:
                            bad_lf.append((lf, graph['denotation']))
                    if not find_lf:
                        continue
                    json.dump(sen, rf)
                    rf.write('\t')
                    json.dump(answer, rf)
                    rf.write('\t')
                    json.dump(good_lf, rf)
                    rf.write('\t')
                    json.dump(bad_lf, rf)
                    rf.write('\n')
def convert_graph(data_dir):
    _allowed_error = 0.000001
    rname = os.path.join(data_dir, 'train_lf')
    with open(rname, 'w') as rf:
        for fname in ('train.graph', 'valid.graph'):
            print('reading', fname)
            pname = os.path.join(data_dir, fname)
            with codecs.open(pname, 'r', 'utf-8') as f:
                for line in f:
                    line = json.loads(line)
                    sen = line['sentence'].split(' ')
                    forest, answer = line['forest'], line['answerF1']
                    good_lf = []
                    bad_lf = []
                    for choice in forest:
                        entity_list = choice['entities']
                        for graph in choice['graphs']:
                            lf = graph2lf(graph['graph'], entity_list)
                            parse_tree = Tree()
                            parse_tree.construct_from_sexp(lf)
                            nt, ter = parse_tree.get_nt_ter()
                            if set(graph['denotation']) & set(answer):
                                good_lf.append((lf, graph['denotation']))
                            else:
                                bad_lf.append((lf, graph['denotation']))
                    json.dump(sen, rf)
                    rf.write('\t')
                    json.dump(answer, rf)
                    rf.write('\t')
                    json.dump(good_lf, rf)
                    rf.write('\t')
                    json.dump(bad_lf, rf)
                    rf.write('\n')
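# Each record written above is four tab-separated JSON fields. A
# hypothetical line, purely for illustration:
#
#   ["who", "borders", "texas"]\t["oklahoma"]\t[["(lf ...)", ["oklahoma"]]]\t[]
#    sentence tokens              answer       good (lf, denotation) pairs   bad pairs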
def load_data(data_dir, order='top_down'):
    '''construct vocab and load data with a specified traversal order'''
    general_predicate_dir = os.path.join(data_dir, "general_nts")
    action_dir = os.path.join(data_dir, "actions")
    word_vocab = Vocab()
    nt_vocab = Vocab()
    ter_vocab = Vocab()
    act_vocab = Vocab()
    with codecs.open(general_predicate_dir, 'r', 'utf-8') as f:
        general_predicate = f.read().split('\n')
    nt_vocab.feed_all(general_predicate)
    with codecs.open(action_dir, 'r', 'utf-8') as f:
        actions = f.read().split('\n')
    act_vocab.feed_all(actions)
    word_tokens = collections.defaultdict(list)
    tree_tokens = collections.defaultdict(list)
    tran_actions = collections.defaultdict(list)
    for fname in ('train', 'valid', 'test'):
        print('reading', fname)
        pname = os.path.join(data_dir, fname)
        with codecs.open(pname, 'r', 'utf-8') as f:
            for line in f:
                sen, sexp = line.rstrip().split('\t')
                sen = sen.split(' ')
                word_vocab.feed_all(sen)
                word_tokens[fname].append(sen)
                parse_tree = Tree()
                parse_tree.construct_from_sexp(sexp)
                nt, ter = parse_tree.get_nt_ter()
                nt_vocab.feed_all(nt)
                ter_vocab.feed_all(ter)
                tree_token, action = parse_tree.get_oracle(order, general_predicate)
                act_vocab.feed_all(action)
                tree_tokens[fname].append(tree_token)
                tran_actions[fname].append(action)
    return word_vocab, nt_vocab, ter_vocab, act_vocab, word_tokens, tree_tokens, tran_actions
def __init__(self, world, x, y, w, h, tile):
    pygame.sprite.Sprite.__init__(self)
    self.world = world
    self.x = x
    self.y = y
    self.w = w
    self.h = h
    self.tile = tile
    self.max_food = 10
    self.fertility_mult = 0.5
    self.colour = (255, 0, 0)
    if self.tile.terrain == 'meadow':
        self.fertility_mult = 0.0025
        self.colour = (80, 180, 80)
        self.max_food = 2
    elif self.tile.terrain == 'lake':
        self.fertility_mult = 0
        self.colour = (0, 0, 215)
        self.max_food = 0
    elif self.tile.terrain == 'forest':
        self.fertility_mult = 0.005
        self.colour = (0, 120, 0)
        self.max_food = 10
    else:
        print('unknown terrain type: %r' % self.tile)
    self.image = pygame.Surface((self.w, self.h)).convert()
    self.image.fill((0, 0, 255))
    self.redraw = True
    self.rect = self.image.get_rect()
    self.rect.x = self.x * self.w
    self.rect.y = self.y * self.h
    self.alltrees = group.Group()
    self.allfood = group.Group()
    self.allcharacters = group.Group()
    if tile.terrain == 'forest':
        t = tree.Tree(
            self,
            random.randint(4, 18),  # radius
            random.randint(0, w),   # x
            random.randint(0, h))   # y
        self.alltrees.add(t)
        world.alltrees.add(t)