def get_similarity_scores(verb_token, vectorizer, tf_idf_matrix):
""" Compute the cosine similarity score of a given verb token against the input corpus TF/IDF matrix.
:param str verb_token: Surface form of a verb, e.g., *born*
:param sklearn.feature_extraction.text.TfidfVectorizer vectorizer: Vectorizer
used to transform verbs into vectors
:return: cosine similarity score
:rtype: ndarray
"""
verb_token_vector = vectorizer.transform([verb_token])
# Here the linear kernel is the same as the cosine similarity, but faster
# cf. http://scikit-learn.org/stable/modules/metrics.html#cosine-similarity
scores = linear_kernel(verb_token_vector, tf_idf_matrix)
logger.debug("Corpus-wide TF/IDF scores for '%s': %s" % (verb_token, scores))
logger.debug("Average TF/IDF score for '%s': %f" % (verb_token, average(scores)))
return scores
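A minimal usage sketch, assuming scikit-learn is available; the toy corpus, the logger, and the numpy imports below are illustrative stand-ins for what the original module presumably defines:

import logging
from numpy import average
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

logger = logging.getLogger(__name__)

# hypothetical toy corpus of verb documents
corpus = ["born raised married", "born died", "married born"]
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(corpus)

scores = get_similarity_scores("born", vectorizer, tf_idf_matrix)
print(scores.shape)  # (1, number of corpus documents)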
def ndcg(self, rankers, cutoff):
        '''
        Compute the mean nDCG@cutoff of each ranker over all queries.
        rankers: instances of Ranker
        cutoff: rank cutoff for nDCG
        '''
result = defaultdict(list)
for q in self.docs:
documents = self.docs[q]
rels = {id(d): d.rel for d in documents}
for idx, ranker in enumerate(rankers):
res = ranker.rank(documents)
ranked_list = [id(d) for d in res]
score = ndcg(ranked_list, rels, cutoff)
result[idx].append(score)
for idx in result:
result[idx] = np.average(result[idx])
return result
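This method delegates the per-query score to a module-level ndcg(ranked_list, rels, cutoff) helper that is not shown here; a plain-Python sketch of a common DCG/IDCG formulation (the actual helper may differ) is:

import math

def ndcg(ranked_list, rels, cutoff):
    # discounted cumulative gain over the top `cutoff` ranked document ids
    dcg = sum((2 ** rels.get(d, 0) - 1) / math.log2(i + 2)
              for i, d in enumerate(ranked_list[:cutoff]))
    # ideal DCG: the best achievable ordering of the same relevance labels
    ideal = sorted(rels.values(), reverse=True)[:cutoff]
    idcg = sum((2 ** r - 1) / math.log2(i + 2) for i, r in enumerate(ideal))
    return dcg / idcg if idcg > 0 else 0.0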
def calculate_gap(predictions, actuals, top_k=20):
"""Performs a local (numpy) calculation of the global average precision.
Only the top_k predictions are taken for each of the videos.
Args:
predictions: Matrix containing the outputs of the model.
Dimensions are 'batch' x 'num_classes'.
actuals: Matrix containing the ground truth labels.
Dimensions are 'batch' x 'num_classes'.
top_k: How many predictions to use per video.
Returns:
float: The global average precision.
"""
gap_calculator = ap_calculator.AveragePrecisionCalculator()
sparse_predictions, sparse_labels, num_positives = top_k_by_class(predictions, actuals, top_k)
gap_calculator.accumulate(flatten(sparse_predictions), flatten(sparse_labels), sum(num_positives))
return gap_calculator.peek_ap_at_n()
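For intuition, a hedged plain-numpy sketch of the average-precision quantity that the calculator accumulates (ap_calculator.AveragePrecisionCalculator itself handles interpolation and large-scale accumulation; this is not its implementation):

import numpy as np

def average_precision_sketch(scores, labels, num_positives):
    # sort by descending score, then average the precision at each positive hit
    order = np.argsort(-np.asarray(scores, dtype=float))
    hits = np.asarray(labels, dtype=float)[order]
    precisions = np.cumsum(hits) / (np.arange(len(hits)) + 1)
    return float(np.sum(precisions * hits) / max(num_positives, 1))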
def correlations(A,B,pc_n=100):
    """Return overall, Spearman, per-row, per-column, and per-principal-component correlation measures between matrices A and B."""
    # overall (flattened) correlation and Spearman rank correlation
    p = (1 - distance.correlation(A.flatten(),B.flatten()))
    spear = spearmanr(A.flatten(),B.flatten())
    # per-row correlations (e.g. per gene)
    dist_genes = np.zeros(A.shape[0])
for i in range(A.shape[0]):
dist_genes[i] = 1 - distance.correlation(A[i],B[i])
pg = (np.average(dist_genes[np.isfinite(dist_genes)]))
dist_sample = np.zeros(A.shape[1])
for i in range(A.shape[1]):
dist_sample[i] = 1 - distance.correlation(A[:,i],B[:,i])
ps = (np.average(dist_sample[np.isfinite(dist_sample)]))
pc_dist = []
if pc_n > 0:
u0,s0,vt0 = np.linalg.svd(A)
u,s,vt = np.linalg.svd(B)
for i in range(pc_n):
pc_dist.append(abs(1 - distance.cosine(u0[:,i],u[:,i])))
pc_dist = np.array(pc_dist)
return p,spear[0],pg,ps,pc_dist
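A hedged usage example on random matrices (the imports mirror what the function body needs; values are illustrative):

import numpy as np
from scipy.spatial import distance
from scipy.stats import spearmanr

A = np.random.rand(50, 20)
B = A + 0.1 * np.random.rand(50, 20)   # B is a noisy copy of A
p, spear, pg, ps, pc_dist = correlations(A, B, pc_n=5)
print(p, spear, pg, ps, pc_dist.shape)  # high correlations; pc_dist has 5 entries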
def record_tabular_misc_stat(key, values, placement='back'):
    """Record the Average/Std/Median/Min/Max of `values`; `placement` controls whether the stat name follows the key ('back', e.g. ReturnAverage) or precedes it ('front', e.g. AverageReturn)."""
    if placement == 'front':
prefix = ""
suffix = key
else:
prefix = key
suffix = ""
if len(values) > 0:
record_tabular(prefix + "Average" + suffix, np.average(values))
record_tabular(prefix + "Std" + suffix, np.std(values))
record_tabular(prefix + "Median" + suffix, np.median(values))
record_tabular(prefix + "Min" + suffix, np.min(values))
record_tabular(prefix + "Max" + suffix, np.max(values))
else:
record_tabular(prefix + "Average" + suffix, np.nan)
record_tabular(prefix + "Std" + suffix, np.nan)
record_tabular(prefix + "Median" + suffix, np.nan)
record_tabular(prefix + "Min" + suffix, np.nan)
record_tabular(prefix + "Max" + suffix, np.nan)
def step(self):
"""
Half of the step of k-means
"""
if self.step_completed:
d = self.data.X
points = [d[self.clusters == i] for i in range(len(self.centroids))]
for i in range(len(self.centroids)):
c_points = points[i]
self.centroids[i, :] = (np.average(c_points, axis=0)
if len(c_points) > 0 else np.nan)
# reinitialize empty centroids
nan_c = np.isnan(self.centroids).any(axis=1)
if np.count_nonzero(nan_c) > 0:
self.centroids[nan_c] = self.random_positioning(
np.count_nonzero(nan_c))
self.centroids_moved = True
else:
self.clusters = self.find_clusters(self.centroids)
self.centroids_moved = False
self.step_no += 1
self.centroids_history = self.set_list(
self.centroids_history, self.step_no, np.copy(self.centroids))
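For context, the other half of the iteration (the assignment performed by find_clusters) amounts to a nearest-centroid search; a hedged standalone sketch, not the widget's actual implementation:

import numpy as np

def find_clusters_sketch(X, centroids):
    # squared Euclidean distance of every point to every centroid
    d = ((X[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2)
    # assign each point to its nearest centroid
    return np.argmin(d, axis=1)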
def solint_numpy_indexing(dsref):
    """Average together the y values that share the same x (time) value, using numpy indexing; returns the elapsed wall-clock time."""
    start = time.time()
dsref.as_numarray()
tms = numpy.unique(dsref.x)
# check if there is something to be averaged at all
if len(tms)==len(dsref.x):
return time.time() - start
newds = dataset()
for tm in tms:
newds.append(tm, numpy.average(dsref.y[numpy.where(dsref.x==tm)]) )
dsref.x = newds.x
dsref.y = newds.y
return time.time() - start
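The same per-timestamp averaging can also be done without a Python loop, e.g. with np.unique(..., return_inverse=True) and np.bincount; a sketch under the assumption that x and y are plain 1-D numeric arrays:

import numpy as np

def solint_bincount_sketch(x, y):
    # group y values by identical x (time) values and average each group
    tms, inverse = np.unique(x, return_inverse=True)
    sums = np.bincount(inverse, weights=y)
    counts = np.bincount(inverse)
    return tms, sums / counts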
def solint_pure_python3(dsref):
    start = time.time()
    tms = set(dsref.x)
    # check if there is something to be averaged at all
    if len(tms) == len(dsref.x):
        return time.time() - start
    # accumulate data into bins of the same time (zip/items instead of the Python-2-only izip/iteritems)
    r = collections.defaultdict(average)
    for tm, y in zip(dsref.x, dsref.y):
        r[tm].add(y)
    # do the averaging
    x, y = list(), list()
    for tm, ys in r.items():
        x.append(tm)
        y.append(ys.avg())
    dsref.x = x
    dsref.y = y
    return time.time() - start
def markdown_overall_speedups(_type, _timing, r_benchmarks):
    txt_geomean = ' Geometric mean :: '
txt_avg = ' Average :: '
txt_max = ' Maximum :: '
for _interp in r_benchmarks:
txt_geomean += _interp + ': `' + ("%.3f" % geomean(r_benchmarks[_interp]) ) + 'x`, '
txt_avg += _interp + ': `' + ("%.3f" % np.average(r_benchmarks[_interp])) + 'x`, '
txt_max += _interp + ': `' + ("%.3f" % max(r_benchmarks[_interp]) ) + 'x`, '
if _interp not in benchmarks_stats_overall:
benchmarks_stats_overall[_interp] = {}
if _timing not in benchmarks_stats_overall[_interp]:
benchmarks_stats_overall[_interp][_timing] = []
benchmarks_stats_overall[_interp][_timing] += r_benchmarks[_interp]
txt_geomean += '\n\n'
txt_avg += '\n\n'
txt_max += '\n\n'
if _type not in benchmarks_stats_types:
benchmarks_stats_types[_type] = {}
benchmarks_stats_types[_type][_timing] = [txt_geomean, txt_avg, txt_max]
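The geomean helper used above is not shown in this snippet; a common definition (hedged, the source may instead use a library routine such as scipy.stats.gmean) is the exponential of the average log:

import numpy as np

def geomean(values):
    # geometric mean: n-th root of the product, computed in log space for numerical stability
    values = np.asarray(values, dtype=float)
    return float(np.exp(np.average(np.log(values))))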
def calculate_hit_at_one(predictions, actuals):
"""Performs a local (numpy) calculation of the hit at one.
Args:
predictions: Matrix containing the outputs of the model.
Dimensions are 'batch' x 'num_classes'.
actuals: Matrix containing the ground truth labels.
Dimensions are 'batch' x 'num_classes'.
Returns:
float: The average hit at one across the entire batch.
"""
top_prediction = numpy.argmax(predictions, 1)
hits = actuals[numpy.arange(actuals.shape[0]), top_prediction]
return numpy.average(hits)
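A tiny worked example (values illustrative): the arg-max prediction of the first row lands on a positive label and the second does not, so hit-at-one averages to 0.5:

import numpy

predictions = numpy.array([[0.1, 0.9],    # top prediction: class 1
                           [0.8, 0.2]])   # top prediction: class 0
actuals = numpy.array([[0, 1],            # class 1 is positive -> hit
                       [0, 1]])           # class 0 is not positive -> miss
print(calculate_hit_at_one(predictions, actuals))  # 0.5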
def calculate_precision_at_equal_recall_rate(predictions, actuals):
"""Performs a local (numpy) calculation of the PERR.
Args:
predictions: Matrix containing the outputs of the model.
Dimensions are 'batch' x 'num_classes'.
actuals: Matrix containing the ground truth labels.
Dimensions are 'batch' x 'num_classes'.
Returns:
float: The average precision at equal recall rate across the entire batch.
"""
aggregated_precision = 0.0
num_videos = actuals.shape[0]
for row in numpy.arange(num_videos):
num_labels = int(numpy.sum(actuals[row]))
top_indices = numpy.argpartition(predictions[row],
-num_labels)[-num_labels:]
item_precision = 0.0
for label_index in top_indices:
if predictions[row][label_index] > 0:
item_precision += actuals[row][label_index]
item_precision /= top_indices.size
aggregated_precision += item_precision
aggregated_precision /= num_videos
return aggregated_precision
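Again a tiny worked example (illustrative): each row keeps as many top-scored classes as it has positive labels, and precision is measured within that set:

import numpy

predictions = numpy.array([[0.7, 0.2, 0.6],
                           [0.1, 0.9, 0.3]])
actuals = numpy.array([[1, 0, 1],   # 2 labels; top-2 predictions are classes 0 and 2 -> precision 1.0
                       [1, 0, 0]])  # 1 label; top-1 prediction is class 1 -> precision 0.0
print(calculate_precision_at_equal_recall_rate(predictions, actuals))  # 0.5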
def get_regression_data(name, split, data_path=data_path):
path = '{}{}.csv'.format(data_path, name)
if not os.path.isfile(path):
download(name +'.csv', data_path=data_path)
data = pandas.read_csv(path, header=None).values
if name in ['energy', 'naval']:
# there are two Ys for these, but take only the first
X_full = data[:, :-2]
Y_full = data[:, -2]
else:
X_full = data[:, :-1]
Y_full = data[:, -1]
X, Y, Xs, Ys = make_split(X_full, Y_full, split)
    # whiten inputs (standardise each column to zero mean, unit variance)
X_mean, X_std = np.average(X, 0), np.std(X, 0)+1e-6
X = (X - X_mean)/X_std
Xs = (Xs - X_mean)/X_std
return X, Y[:, None], Xs, Ys[:, None]
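A quick check of the whitening step in isolation (illustrative arrays; download and make_split are helpers from the original module and are not reproduced here):

import numpy as np

X = np.random.rand(100, 4) * 10 + 3          # arbitrary scale and offset
X_mean, X_std = np.average(X, 0), np.std(X, 0) + 1e-6
Xw = (X - X_mean) / X_std
print(np.average(Xw, 0).round(6))  # approximately zero per column
print(np.std(Xw, 0).round(6))      # approximately one per column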
def test_weighted_average(self):
""" Test results of weighted average against numpy.average """
stream = [np.random.random(size = (16,16)) for _ in range(5)]
with self.subTest('float weights'):
weights = [random() for _ in stream]
from_iaverage = last(iaverage(stream, weights = weights))
from_numpy = np.average(np.dstack(stream), axis = 2, weights = np.array(weights))
self.assertTrue(np.allclose(from_iaverage, from_numpy))
with self.subTest('array weights'):
weights = [np.random.random(size = stream[0].shape) for _ in stream]
from_iaverage = last(iaverage(stream, weights = weights))
from_numpy = np.average(np.dstack(stream), axis = 2, weights = np.dstack(weights))
self.assertTrue(np.allclose(from_iaverage, from_numpy))
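For intuition, a minimal sketch of a streaming weighted average in the spirit of the iaverage under test (hedged; the real npstreams implementation is more general):

import numpy as np

def iaverage_sketch(arrays, weights):
    # maintain a running weighted sum and total weight, yielding the average so far
    total = np.zeros(np.asarray(arrays[0]).shape, dtype=float)
    wsum = np.zeros_like(total)
    for arr, w in zip(arrays, weights):
        total += np.asarray(arr, dtype=float) * w
        wsum += w
        yield total / wsum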