def select_index(self, compare, result='boolean'):
    """
    Finds the elements in the index that match the compare parameter and returns either a list of the values
    that match, or a boolean list the length of the index with True at each position that matches. If the
    indexes are tuples then compare is a tuple where None in any field is treated as "*" and matches all
    values.

    :param compare: value to compare as a singleton or tuple
    :param result: 'boolean' = returns a list of booleans, 'value' = returns a list of index values that match
    :return: list of booleans or values
    """
    if isinstance(compare, tuple):
        # match every tuple in the index, treating None in compare as an "*" wildcard for that position
        booleans = [all(compare[i] == w if compare[i] is not None else True for i, w in enumerate(v))
                    for v in self._index]
    else:
        booleans = [False] * len(self._index)
        if self._sort:
            booleans[sorted_index(self._index, compare)] = True
        else:
            booleans[self._index.index(compare)] = True
    if result == 'boolean':
        return booleans
    elif result == 'value':
        # itertools.compress keeps only the index values whose mask entry is True
        return list(compress(self._index, booleans))
    else:
        raise ValueError('only valid values for result parameter are: boolean or value.')
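For reference, a minimal standalone sketch of the wildcard matching used above, with an invented index of tuples (the index and key values here are illustrative, not part of the original class):

from itertools import compress

index = [('a', 1), ('a', 2), ('b', 1)]
key = ('a', None)  # None acts as an "*" wildcard for that position

mask = [all(k == v if k is not None else True for k, v in zip(key, row)) for row in index]
print(mask)                         # [True, True, False]
print(list(compress(index, mask)))  # [('a', 1), ('a', 2)]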
Python compress() example source code
def pass_outliers(data):
    return itertools.compress(data, (z >= 3.5 for z in z_mod(data)))

def reject_outliers(data):
    return itertools.compress(data, (z < 3.5 for z in z_mod(data)))
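These two functions rely on a z_mod helper that is not shown. A common choice for a 3.5 cutoff is the modified z-score based on the median and the median absolute deviation (MAD); a minimal sketch under that assumption, not the original module's implementation:

import itertools
import statistics

def z_mod(data):
    # modified z-score: 0.6745 * (x - median) / MAD (assumed implementation)
    median = statistics.median(data)
    mad = statistics.median(abs(x - median) for x in data)
    if mad == 0:
        return [0.0 for _ in data]
    return [0.6745 * (x - median) / mad for x in data]

data = [1, 2, 2, 3, 2, 100]
print(list(itertools.compress(data, (z < 3.5 for z in z_mod(data)))))  # [1, 2, 2, 3, 2]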
nvdm_nobatch.py — Project: NVDM-For-Document-Classification, Author: cryanzpj
def prediction(x_sample, y_sample):  # sample has size 20
    '''
    Get the perplexity of the test set
    '''
    perplist = []
    for i in range(20):
        # ids of the words with non-zero counts in this document's bag-of-words vector
        x_batch_id = [_ for _ in itertools.compress(range(10000), map(lambda x: x > 0, x_sample[i]))]
        feed_dict = {nvdm.input_x: x_sample[i].reshape(1, 10000)}
        step, p_xi_h = sess.run([nvdm.global_step, nvdm.p_xi_h], feed_dict)
        valid_p = np.mean(np.log(p_xi_h[x_batch_id]))
        perplist.append(valid_p)
    print("perplexity: {}".format(np.exp(-np.mean(perplist))))
nvdm_nobatch_new.py — Project: NVDM-For-Document-Classification, Author: cryanzpj
def train_step(x_batch, y_batch, epoch, predicts, labels):
    """
    A single training step
    """
    y_batch = y_batch.reshape(1, -1)
    # ids of the words with non-zero counts in this document's bag-of-words vector
    x_batch_id = [_ for _ in itertools.compress(range(10000), map(lambda x: x > 0, x_batch[0]))]
    feed_dict = {nvdm.input_x: x_batch,
                 nvdm.input_y: y_batch,
                 nvdm.x_id: x_batch_id}
    '''
    h1b = [v for v in tf.all_variables() if v.name == "h1/b:0"][0]
    h1w = [v for v in tf.all_variables() if v.name == "h1/w:0"][0]
    _, step, summaries, loss, kl, rc, p_xi_h, R, hb, hw, e = sess.run(
        [nvdm.train_op, global_step, loss_summary, nvdm.loss, nvdm.KL, nvdm.recon_loss, nvdm.p_xi_h, nvdm.R, h1b, h1w, nvdm.e], feed_dict)
    '''
    _, step, loss, predict = sess.run([nvdm.train_op, nvdm.global_step, nvdm.loss, nvdm.predicts], feed_dict)
    if np.isnan(loss):
        import pdb
        pdb.set_trace()
    time_str = datetime.datetime.now().isoformat()
    if step % FLAGS.train_every == 0:
        import pdb
        pdb.set_trace()
        score = f1_score_multiclass(np.array(predicts), np.array(labels))
        print("time: {}, epoch: {}, step: {}, loss: {:g}, score: {:g}".format(time_str, epoch, step, loss, score))
        return [], []
    predicts.append(predict)
    labels.append(y_batch[0].astype(int))
    return predicts, labels
    # train_summary_writer.add_summary(summaries, step)
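f1_score_multiclass is a project helper that is not shown here. A hedged stand-in with the same call shape, built on scikit-learn's macro-averaged F1 over 0/1 indicator arrays (an assumption about its semantics, not the project's own implementation):

import numpy as np
from sklearn.metrics import f1_score

def f1_score_multiclass(predicts, labels):
    # predicts and labels: 0/1 indicator arrays of shape (n_samples, n_labels); assumed semantics
    return f1_score(np.array(labels), np.array(predicts), average='macro')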
nvdm_nobatch_new.py — Project: NVDM-For-Document-Classification, Author: cryanzpj
def prediction(x_sample, y_sample):  # sample has size 20
    '''
    Get the perplexity of the test set
    '''
    perplist = []
    for i in range(20):
        # ids of the words with non-zero counts in this document's bag-of-words vector
        x_batch_id = [_ for _ in itertools.compress(range(10000), map(lambda x: x > 0, x_sample[i]))]
        feed_dict = {nvdm.input_x: x_sample[i].reshape(1, 10000),
                     nvdm.input_y: y_sample[i].reshape(1, 103)}
        step, p_xi_h = sess.run([nvdm.global_step, nvdm.p_xi_h], feed_dict)
        valid_p = np.mean(np.log(p_xi_h[x_batch_id]))
        perplist.append(valid_p)
    print("perplexity: {}".format(np.exp(-np.mean(perplist))))
def train(self, X_train, y_train):
    # self.saver.restore(self.sess, "./imdbmodel/model.ckpt")
    total_batch = X_train.shape[0] // self.batch_size
    for e in range(self.epoch):
        perplist = []
        for i in range(total_batch):
            X_batch = X_train[i * self.batch_size:(i + 1) * self.batch_size]
            y_batch = y_train[i * self.batch_size:(i + 1) * self.batch_size]
            x_batch_id = [_ for _ in itertools.compress(range(self.feature_size),
                                                        map(lambda x: x > 0, X_batch[0].toarray()[0]))]
            feed_dict = {
                self.input_x: X_batch.toarray(),
                self.input_y: np.reshape(y_batch, [-1, 1]),
                self.x_id: x_batch_id
            }
            _, loss = self.sess.run([
                self.train_op,
                self.loss], feed_dict)
            if np.isnan(loss):
                import pdb
                pdb.set_trace()
            if i % self.display_score == 0:
                p_xi_h = self.sess.run([self.p_xi_h], feed_dict)
                valid_p = np.mean(np.log(p_xi_h[0][x_batch_id]))
                perplist.append(valid_p)
                print("step: {}, perp: {:f}".format(i, np.exp(-np.mean(perplist))))
            # save model every epoch
            if i > 0 and i % 2000 == 0:
                self.savemodel()
def _evaluate(self, individual, X, y, cv=3):
    """ Evaluate method

    Parameters
    ----------
    individual: list [n_features]
        The input individual to be evaluated

    Returns
    -------
    Score of the individual: tuple (cross_val_score, feature score)
    """
    # Select features: keep only the columns whose bit in the individual is 1
    features = list(compress(range(len(individual)), individual))
    train = np.reshape([X[:, i] for i in features],
                       [len(features), len(X)]).T
    if train.shape[1] == 0:
        return 0, 1,
    # Applying k-fold cross-validation
    accuracies = cross_val_score(estimator=clone(self.estimator), X=train,
                                 y=y, cv=cv,
                                 scoring=self.cv_metric_function)
    if self.features_metric_function is None:
        feature_score = pow(sum(individual) / (len(individual) * 5), 2)
    else:
        feature_score = self.features_metric_function(individual)
    return accuracies.mean() - accuracies.std(), feature_score
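The feature-selection step above is simply a compress over a bit mask; a minimal illustration with an invented individual:

from itertools import compress

individual = [1, 0, 1, 1, 0]  # one bit per feature
features = list(compress(range(len(individual)), individual))
print(features)               # [0, 2, 3] -> indices of the selected feature columns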
def _evaluate(self, individual, X, y, cv=3):
    """ Evaluate method

    Parameters
    ----------
    individual: list [n_features]
        The input individual to be evaluated

    Returns
    -------
    Score of the individual: tuple (cross_val_score, feature score)
    """
    # Select features: keep only the columns whose bit in the individual is 1
    features = list(compress(range(len(individual)), individual))
    train = np.reshape([X[:, i] for i in features],
                       [len(features), len(X)]).T
    if train.shape[1] == 0:
        return 0, 1,
    # Applying k-fold cross-validation
    accuracies = cross_val_score(estimator=clone(self.estimator), X=train,
                                 y=y, cv=cv,
                                 scoring=self.cv_metric_function)
    if self.features_metric_function == "log":
        feature_score = np.log10(9 * (sum(individual) / len(individual)) + 1)
    elif self.features_metric_function == "poly":
        feature_score = sum(individual) / len(individual)
    else:
        raise ValueError('Unknown evaluation')
    return accuracies.mean() - accuracies.std(), feature_score
def generate_batch_pvdm(doc_ids, word_ids, batch_size, window_size):
    '''
    Batch generator for PV-DM (Distributed Memory Model of Paragraph Vectors).
    batch should be a shape of (batch_size, window_size+1)

    Parameters
    ----------
    doc_ids: list of document indices
    word_ids: list of word indices
    batch_size: number of words in each mini-batch
    window_size: number of leading words before the target word
    '''
    global data_index
    assert batch_size % window_size == 0
    batch = np.ndarray(shape=(batch_size, window_size + 1), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = window_size + 1
    buffer = collections.deque(maxlen=span)      # used for collecting word_ids[data_index] in the sliding window
    buffer_doc = collections.deque(maxlen=span)  # collecting id of documents in the sliding window
    # collect the first window of words
    for _ in range(span):
        buffer.append(word_ids[data_index])
        buffer_doc.append(doc_ids[data_index])
        data_index = (data_index + 1) % len(word_ids)
    mask = [1] * span
    mask[-1] = 0
    i = 0
    while i < batch_size:
        if len(set(buffer_doc)) == 1:
            doc_id = buffer_doc[-1]
            # all leading words and the doc_id
            batch[i, :] = list(compress(buffer, mask)) + [doc_id]
            labels[i, 0] = buffer[-1]  # the last word at end of the sliding window
            i += 1
        # move the sliding window
        buffer.append(word_ids[data_index])
        buffer_doc.append(doc_ids[data_index])
        data_index = (data_index + 1) % len(word_ids)
    return batch, labels
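A tiny, hypothetical driver for the generator above (the doc_ids/word_ids values are invented, data_index must exist at module level, and the function's own imports, i.e. numpy, collections and itertools.compress, are assumed to be in scope):

data_index = 0
doc_ids = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
word_ids = [5, 3, 8, 2, 7, 4, 9, 1, 6, 0]
batch, labels = generate_batch_pvdm(doc_ids, word_ids, batch_size=8, window_size=2)
# each row of batch is [word_{t-2}, word_{t-1}, doc_id]; labels holds the target word word_t
print(batch)
print(labels)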
def generate_batch_cbow(data, batch_size, num_skips, skip_window):
    '''
    Batch generator for CBOW (Continuous Bag of Words).
    batch should be a shape of (batch_size, num_skips)

    Parameters
    ----------
    data: list of index of words
    batch_size: number of words in each mini-batch
    num_skips: number of surrounding words in both directions (2: one word ahead and one word following)
    skip_window: number of words at both ends of a sentence to skip (1: skip the first and last word of a sentence)
    '''
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size, num_skips), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)  # used for collecting data[data_index] in the sliding window
    # collect the first window of words
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # move the sliding window
    for i in range(batch_size):
        mask = [1] * span
        mask[skip_window] = 0
        batch[i, :] = list(compress(buffer, mask))  # all surrounding words
        labels[i, 0] = buffer[skip_window]          # the word at the center
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels
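The mask/compress trick above is what extracts the context words around the center target; the same idea in isolation, with invented word ids:

from itertools import compress
from collections import deque

span = 5                             # 2 * skip_window + 1 with skip_window = 2
buffer = deque([10, 11, 12, 13, 14], maxlen=span)
mask = [1] * span
mask[2] = 0                          # drop the center word (the prediction target)
print(list(compress(buffer, mask)))  # [10, 11, 13, 14] -> the surrounding context words
print(buffer[2])                     # 12 -> the target word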
def simplified(self):
    """
    Returns the reduced list of coordinates
    """
    if not self.Simple_mask:
        self._simplify_()
    return list(itertools.compress(self, self.Simple_mask))
def afilter(function, sequence):
    """Equivalent of filter() that takes an async filter function.
    Returns a list.
    """
    if function is None:
        result(filter(None, sequence)); return
    should_include = yield [function.asynq(elt) for elt in sequence]
    result(list(itertools.compress(sequence, should_include))); return

def afilterfalse(function, sequence):
    """Equivalent of itertools.ifilterfalse() that takes an async filter function.
    Returns a list.
    """
    should_exclude = yield [function.asynq(elt) for elt in sequence]
    should_include = [not res for res in should_exclude]
    result(list(itertools.compress(sequence, should_include))); return
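Outside of asynq the include/exclude masks are plain boolean lists; a synchronous sketch of the same compress pattern (the predicate here is invented):

from itertools import compress

sequence = [1, 2, 3, 4, 5]
should_include = [x % 2 == 0 for x in sequence]
print(list(compress(sequence, should_include)))                   # [2, 4] -> afilter-style
print(list(compress(sequence, (not b for b in should_include))))  # [1, 3, 5] -> afilterfalse-style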
def retrieve(self, table, cols, col_rules):
    """ Retrieves column values from a single table based on a given filtering rule.

    Example:
    <pre lang="python">
    my_db.retrieve(some_table, ["num1", "num2"], {"remainder_div_3": "{}==1 or {}==2", "sum": "{}<200"})
    </pre>
    will retrieve:
    <pre lang="python">
    the columns "num1" and "num2" from the table, but only for rows whose "remainder_div_3" column is 1 or 2
    and whose "sum" column is less than 200. All rules are combined with an "AND" statement.
    </pre>

    :param table: string (name of the table to retrieve from)
    :param cols: list of strings (names of the columns to retrieve)
    :param col_rules: dictionary of rules that will be evaluated
    :return: nested list of the filtered values for the requested columns
    """
    # todo: add string comp support
    cursor = self.conn.cursor()
    # from the table get all the columns to retrieve
    sql_cmd = "select " + " ,".join(cols) + " from \"" + table + "\""
    cursor.execute(sql_cmd)
    sel_sets = cursor.fetchall()
    if len(col_rules) == 0:
        sel_vals = sel_sets
    else:
        # from the table select all the columns to filter on
        sql_cmd = "select " + ", ".join([key for key in col_rules]) + " from \"" + table + "\""
        cursor.execute(sql_cmd)
        filter_sets = cursor.fetchall()
        # repeat every argument the number of times it appears in the selection rule
        mult = [len(re.findall("{}", col_rules[key])) for key in col_rules]

        def _repeat_vals(vals, repeats):
            rep_vals = []
            [[rep_vals.append(vals[i]) for _ in range(repeats[i])] for i in range(len(col_rules))]
            return rep_vals

        filter_sets = [_repeat_vals(set, mult) for set in filter_sets]
        # evaluate every row to get a boolean mask of examples
        rule_tmp = "(" + ") and (".join([col_rules[key] for key in col_rules]) + ")"
        sel_mask = [eval(rule_tmp.format(*val_set)) for val_set in filter_sets]
        # apply the boolean mask to keep only the rows that satisfy the selection rules
        sel_sets = list(compress(sel_sets, sel_mask))
        sel_vals = sel_sets
        # sel_vals = [list(x) for x in zip(*sel_sets)]
    return sel_vals
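The filtering core of retrieve is the eval of the rule template over each row followed by a compress; the same steps in isolation, with invented rows that are already repeated once per '{}' occurrence:

from itertools import compress

col_rules = {"remainder_div_3": "{}==1 or {}==2", "sum": "{}<200"}
filter_sets = [(1, 1, 150), (0, 0, 300), (2, 2, 10)]  # rows, first value repeated for its two '{}'
rule_tmp = "(" + ") and (".join(col_rules[key] for key in col_rules) + ")"
sel_mask = [eval(rule_tmp.format(*row)) for row in filter_sets]
print(sel_mask)                               # [True, False, True]
print(list(compress(filter_sets, sel_mask)))  # keeps only the rows that satisfy every rule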
def handle_fr_flags(arg):
    def get_keys(revision):
        return (
            (u'??????????? ???????? ??????????', u'????? ???????? ??????')[revision],
            u'???? ????? ?????????',
            (u'????? ?????? ??????? ????????', u'?????? ?? ?????? ?? ??????????')[revision],
            (u'????? ??????? ??????? ????????', u'?????? ?? ????? ? ?????????', u'?????? ????????')[revision],
            u'???????? ????',
            u'?????? ??????? ??',
            u'????? ???????????? ??????? ?????',
            u'????? ???????????? ??????????? ?????',
            u'?????????? ?????? ??????? ?????',
            u'?????????? ?????? ????????????? ???????',
            u'????',
            u'????????? ?????????? ?????',
            u'?????? ?????? ??????????? ?????????',
            u'??????? ?????? ??????????? ?????????',
            u'????? ??????? ?????',
            u'????? ????????????? ???????'
        )

    bits = misc.int_to_bits(arg, 16)
    a, b, c = 0, 1, 2
    flags_actual = {
        # ?????-??-?
        4: ((0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1), a),
        # ?????-?????-??-?
        9: ((0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0), a),
        # ?????-?????-??-? (?????? 02)
        12: ((0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0), a)
    }
    flags, rev = flags_actual.get(
        handle_fr_flags.model,
        ((1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), a)
    )
    return dict(
        zip(
            itertools.compress(get_keys(rev), flags),
            itertools.compress(bits, flags)
        )
    )
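The zip of two compress calls above pairs only the flag names that a given device model actually reports with their bit values; the same pattern on invented data:

import itertools

keys = ('a', 'b', 'c', 'd')
bits = (1, 0, 1, 1)
flags = (1, 1, 0, 1)  # positions this model reports
print(dict(zip(itertools.compress(keys, flags), itertools.compress(bits, flags))))
# {'a': 1, 'b': 0, 'd': 1}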
def generate_batch_pvdm(batch_size, window_size):
    '''
    Batch generator for PV-DM (Distributed Memory Model of Paragraph Vectors).
    batch should be a shape of (batch_size, window_size+1)

    Parameters
    ----------
    batch_size: number of words in each mini-batch
    window_size: number of leading words before the target word
    '''
    global data_index
    assert batch_size % window_size == 0
    batch = np.ndarray(shape=(batch_size, window_size + 1), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = window_size + 1
    buffer = collections.deque(maxlen=span)      # used for collecting word_ids[data_index] in the sliding window
    buffer_doc = collections.deque(maxlen=span)  # collecting id of documents in the sliding window
    # collect the first window of words
    for _ in range(span):
        buffer.append(word_ids[data_index])
        buffer_doc.append(doc_ids[data_index])
        data_index = (data_index + 1) % len(word_ids)
    mask = [1] * span
    mask[-1] = 0
    i = 0
    while i < batch_size:
        if len(set(buffer_doc)) == 1:
            doc_id = buffer_doc[-1]
            # all leading words and the doc_id
            batch[i, :] = list(compress(buffer, mask)) + [doc_id]
            labels[i, 0] = buffer[-1]  # the last word at end of the sliding window
            i += 1
            # print buffer
            # print list(compress(buffer, mask))
        # move the sliding window
        buffer.append(word_ids[data_index])
        buffer_doc.append(doc_ids[data_index])
        data_index = (data_index + 1) % len(word_ids)
    return batch, labels
## examining the batch generator function