def test_dtype(self):
dt = np.intc
p = ndpointer(dtype=dt)
self.assertTrue(p.from_param(np.array([1], dt)))
dt = '<i4'
p = ndpointer(dtype=dt)
self.assertTrue(p.from_param(np.array([1], dt)))
dt = np.dtype('>i4')
p = ndpointer(dtype=dt)
p.from_param(np.array([1], dt))
self.assertRaises(TypeError, p.from_param,
np.array([1], dt.newbyteorder('swap')))
dtnames = ['x', 'y']
dtformats = [np.intc, np.float64]
dtdescr = {'names': dtnames, 'formats': dtformats}
dt = np.dtype(dtdescr)
p = ndpointer(dtype=dt)
self.assertTrue(p.from_param(np.zeros((10,), dt)))
samedt = np.dtype(dtdescr)
p = ndpointer(dtype=samedt)
self.assertTrue(p.from_param(np.zeros((10,), dt)))
dt2 = np.dtype(dtdescr, align=True)
if dt.itemsize != dt2.itemsize:
self.assertRaises(TypeError, p.from_param, np.zeros((10,), dt2))
else:
self.assertTrue(p.from_param(np.zeros((10,), dt2)))
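# The checks above mirror how ndpointer is normally used to validate arguments
# passed to a C routine through ctypes. A minimal sketch, assuming a hypothetical
# shared library ./libsum.so that exposes `int sum_intc(int *data, int n)`:
import ctypes
import numpy as np
from numpy.ctypeslib import ndpointer

lib = ctypes.CDLL('./libsum.so')          # hypothetical library, illustrative only
lib.sum_intc.restype = ctypes.c_int
lib.sum_intc.argtypes = [
    ndpointer(dtype=np.intc, ndim=1, flags='C_CONTIGUOUS'),
    ctypes.c_int,
]

data = np.arange(5, dtype=np.intc)
total = lib.sum_intc(data, data.size)     # dtype and flags are checked by from_param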
def __init__(self, bins, mapq_thresh=30, clip_thresh=1):
# set parameters
self.bins = bins
self.mapQT = mapq_thresh
self.clip_thresh = clip_thresh
# initialise data structures
self.depth_stats = DepthStats(bins, mapq_thresh=mapq_thresh, dtype=np.intc)
self.aln_stats = np.zeros((bins.num, len(AlignStats.aln_stats_cols)), dtype=np.intc)
self.fwd_inserts = np.empty(bins.num, dtype=list)
self.rvs_inserts = np.empty(bins.num, dtype=list)
for j in range(0, bins.num):
self.fwd_inserts[j] = []
self.rvs_inserts[j] = []
def generate_data(n_samples, n_features, size_groups, rho=0.5,
random_state=24):
""" Data generation process with Toplitz like correlated features:
this correspond to the synthetic dataset used in our paper
"GAP Safe Screening Rules for Sparse-Group Lasso".
"""
rng = check_random_state(random_state)
n_groups = len(size_groups)
# g_start = np.zeros(n_groups, order='F', dtype=np.intc)
# for i in range(1, n_groups):
# g_start[i] = size_groups[i - 1] + g_start[i - 1]
# vectorised equivalent of the commented loop above (also valid for unequal group sizes)
g_start = (np.cumsum(size_groups) - size_groups).astype(np.intc)
# 10% of the groups are active
gamma1 = int(np.ceil(n_groups * 0.1))
selected_groups = rng.random_integers(0, n_groups - 1, gamma1)
true_beta = np.zeros(n_features)
for i in selected_groups:
begin = g_start[i]
end = g_start[i] + size_groups[i]
# 10% of the features are active
gamma2 = int(np.ceil(size_groups[i] * 0.1))
selected_features = rng.random_integers(begin, end - 1, gamma2)
ns = len(selected_features)
s = 2 * rng.rand(ns) - 1
u = rng.rand(ns)
true_beta[selected_features] = np.sign(s) * (10 * u + (1 - u) * 0.5)
vect = rho ** np.arange(n_features)
covar = toeplitz(vect, vect)
X = rng.multivariate_normal(np.zeros(n_features), covar, n_samples)
y = np.dot(X, true_beta) + 0.01 * rng.normal(0, 1, n_samples)
return X, y
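# A minimal usage sketch for generate_data (not part of the original module).
# It assumes, as the function body implies, that check_random_state comes from
# scikit-learn and toeplitz from SciPy; group sizes are kept equal here,
# matching the setting of the paper. Note that generate_data relies on
# rng.random_integers, which newer NumPy releases deprecate.
import numpy as np
from scipy.linalg import toeplitz
from sklearn.utils import check_random_state

size_groups = np.repeat(5, 10)            # 10 groups of 5 features -> 50 features
X, y = generate_data(n_samples=100, n_features=50, size_groups=size_groups)
print(X.shape, y.shape)                   # (100, 50) (100,)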
def expected_support():
numpy_datatypes = [numpy.bool_, numpy.bool, numpy.int_,
numpy.intc, numpy.intp, numpy.int8,
numpy.int16, numpy.int32, numpy.int64,
numpy.uint8, numpy.uint16, numpy.uint32,
numpy.uint64, numpy.float_, numpy.float16,
numpy.float32, numpy.float64]
python_datatypes = [bool, int, float, object]
return numpy_datatypes + python_datatypes
def predict_rankings(self, queries, compact=False, n_jobs=1):
'''
Predict rankings of the documents for the given queries.
If `compact` is set to True then the output will be one
long 1d array containing the rankings for all the queries
instead of a list of 1d arrays.
The compact array can subsequently be indexed using the query
index pointer array, see `queries.query_indptr`.
Parameters:
-----------
queries: Queries
The queries whose documents should be ranked.
compact: bool
Specify to return rankings in compact format.
n_jobs: int, optional (default is 1)
The number of working threads that will be spawned to compute
the ranking scores. If -1, the current number of CPUs will be used.
'''
# Predict the ranking scores for the documents.
predictions = self.predict(queries, n_jobs)
rankings = np.zeros(queries.document_count(), dtype=np.intc)
ranksort_queries(queries.query_indptr, predictions, rankings)
if compact or len(queries) == 1:
return rankings
else:
return np.array_split(rankings, queries.query_indptr[1:-1])
def predict_rankings(self, queries, compact=False, n_jobs=1):
'''
Predict rankings of the documents for the given queries.
If `compact` is set to True then the output will be one
long 1d array containing the rankings for all the queries
instead of a list of 1d arrays.
The compact array can subsequently be indexed using the query
index pointer array, see `queries.query_indptr`.
Parameters:
-----------
queries: Queries
The queries whose documents should be ranked.
compact: bool
Specify to return rankings in compact format.
n_jobs: int, optional (default is 1)
The number of working threads that will be spawned to compute
the ranking scores. If -1, the current number of CPUs will be used.
'''
if self.trained is False:
raise ValueError('the model has not been trained yet')
# Predict the ranking scores for the documents.
predictions = self.predict(queries, n_jobs)
rankings = np.zeros(queries.document_count(), dtype=np.intc)
ranksort_queries(queries.query_indptr, predictions, rankings)
if compact or queries.query_count() == 1:
return rankings
else:
return np.array_split(rankings, queries.query_indptr[1:-1])
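# The compact/list relationship used above can be illustrated on its own; a
# self-contained sketch with a hypothetical query_indptr for three queries
# holding 3, 2 and 4 documents:
import numpy as np

query_indptr = np.array([0, 3, 5, 9], dtype=np.intc)
compact = np.array([2, 0, 1, 1, 0, 3, 0, 1, 2], dtype=np.intc)

# Splitting at the interior boundaries recovers one ranking per query.
per_query = np.array_split(compact, query_indptr[1:-1])
print([r.tolist() for r in per_query])    # [[2, 0, 1], [1, 0], [3, 0, 1, 2]]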
def compute_scale(self, queries, relevance_scores=None):
'''
Return the ideal DCG value for each query. Optionally, external
relevance assessments can be used instead of the relevances
present in the queries.
Parameters
----------
queries: Queries
The queries for which the ideal DCG should be computed.
relevance_scores: array of integers, optional (default is None)
The relevance scores that should be used instead of the
relevance scores inside queries. Note that this argument is
experimental.
'''
ideal_values = np.empty(queries.query_count(), dtype=np.float64)
if relevance_scores is not None:
if queries.document_count() != relevance_scores.shape[0]:
raise ValueError('number of documents and relevance scores do not match')
# Need to sort the relevance labels first.
indices = np.empty(relevance_scores.shape[0], dtype=np.intc)
relevance_argsort_v1(relevance_scores, indices, relevance_scores.shape[0])
# Creates a copy.
relevance_scores = relevance_scores[indices]
else:
# Assuming these are sorted.
relevance_scores = queries.relevance_scores
self.metric_.evaluate_queries_ideal(queries.query_indptr, relevance_scores, ideal_values)
return ideal_values
def evaluate(self, ranking=None, labels=None, ranked_labels=None, scales=None):
'''
Evaluate NDCG metric on the specified ranked list of document relevance scores.
The input can be either a ranked list of relevance labels (`ranked_labels`),
which is the most convenient form computationally, or a ranked list of
documents (`ranking`) together with their relevance scores (`labels`), from
which the ranked document relevance labels are computed.
Parameters:
-----------
ranking: array, shape = (n_documents,)
Specify list of ranked documents.
labels: array, shape = (n_documents,)
Specify relevance score for each document.
ranked_labels: array, shape = (n_documents,)
Relevance scores of the ranked documents. If not given, then
`ranking` and `labels` must not be None; `ranked_labels` will
then be inferred from them.
scales: float, optional (default is None)
The ideal DCG value on the given documents. If None is given
it will be computed from the document relevance scores.
'''
if ranked_labels is not None:
return self.get_score_from_labels_list(ranked_labels)
elif ranking is not None and labels is not None:
if ranking.shape[0] != labels.shape[0]:
raise ValueError('number of ranked documents != number of relevance labels (%d, %d)' \
% (ranking.shape[0], labels.shape[0]))
ranked_labels = np.array(sorted(labels, key=dict(zip(labels,ranking)).get, reverse=True), dtype=np.intc)
return self.get_score_from_labels_list(ranked_labels)
def _get_partition_indices(start, end, n_jobs):
'''
Get boundary indices for ``n_jobs`` number of sub-arrays dividing
a (contiguous) array of indices starting with ``start`` (inclusive)
and ending with ``end`` (exclusive) into equal parts.
'''
if (end - start) >= n_jobs:
return np.linspace(start, end, n_jobs + 1).astype(np.intc)
else:
return np.arange(end - start + 1, dtype=np.intc)
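# A quick check of the boundary computation above (a usage sketch, not part of
# the original module):
import numpy as np

print(_get_partition_indices(0, 10, 3))   # [ 0  3  6 10]
print(_get_partition_indices(0, 2, 4))    # fewer items than jobs: [0 1 2]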
def save_as_text(self, filepath, shuffle=False):
'''
Save queries into the specified file in svmlight format.
Parameters:
-----------
filepath: string
The filepath where this object will be saved.
shuffle: bool
Specify to shuffle the query document lists prior
to writing into the file.
'''
# Inflate the query_ids array such that each id covers
# the corresponding feature vectors.
query_ids = np.fromiter(
chain(*[[qid] * cnt for qid, cnt in zip(self.query_ids, np.diff(self.query_indptr))]),
dtype=int)
relevance_scores = self.relevance_scores
feature_vectors = self.feature_vectors
if shuffle:
shuffle_indices = np.random.permutation(self.document_count())
reshuffle_indices = np.argsort(query_ids[shuffle_indices])
document_shuffle_indices = np.arange(self.document_count(),
dtype=np.intc)[shuffle_indices[reshuffle_indices]]
query_ids = query_ids[document_shuffle_indices]
relevance_scores = relevance_scores[document_shuffle_indices]
feature_vectors = feature_vectors[document_shuffle_indices]
with open(filepath, 'w') as ofile:
for score, qid, feature_vector in zip(relevance_scores,
query_ids,
feature_vectors):
ofile.write('%d' % score)
ofile.write(' qid:%d' % qid)
for feature in zip(self.feature_indices, feature_vector):
output = ' %d:%.12f' % feature
ofile.write(output.rstrip('0').rstrip('.'))
ofile.write('\n')
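# A toy illustration of the svmlight line format produced by the writer above,
# including the trailing-zero stripping (the values here are made up):
score, qid = 2, 7
feature_indices = [1, 2, 3]
feature_vector = [0.5, 1.25, 3.0]

line = '%d qid:%d' % (score, qid)
for feature in zip(feature_indices, feature_vector):
    output = ' %d:%.12f' % feature
    line += output.rstrip('0').rstrip('.')
print(line)                               # 2 qid:7 1:0.5 2:1.25 3:3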
def _action(*entries):
return np.array(entries, dtype=np.intc)
def get_idxs_thread(comm, npoints):
""" Get indices for processor using Scatterv
Note:
-----
Uppercase mpi4py functions require everything to be in C-compatible
types or they will return garbage!
"""
size = comm.Get_size()
rank = comm.Get_rank()
npoints_thread = np.zeros(size,dtype=np.intc)
offsets_thread = np.zeros(size,dtype=np.intc)
for idx in range(size):
npoints_thread[idx] = npoints // size  # integer division (Python 3)
offsets_thread[idx] = sum(npoints_thread[:idx])
for idx in range(npoints % size):
npoints_thread[idx] += 1
offsets_thread[idx + 1:] += 1
npoints_thread = tuple(npoints_thread)
offsets_thread = tuple(offsets_thread)
idxs_thread = np.zeros(npoints_thread[rank],dtype=np.intc)
idxs = np.arange(npoints,dtype=np.intc)
comm.Scatterv((idxs, npoints_thread, offsets_thread, MPI.INT), idxs_thread, root=0)
return idxs_thread, npoints_thread, offsets_thread
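# A minimal driver for get_idxs_thread (a sketch; assumes mpi4py is installed
# and the script is launched with mpirun/mpiexec):
from mpi4py import MPI
import numpy as np  # used inside get_idxs_thread

comm = MPI.COMM_WORLD
idxs_thread, npoints_thread, offsets_thread = get_idxs_thread(comm, npoints=10)
print('rank %d owns indices %s' % (comm.Get_rank(), idxs_thread))
# e.g. with 3 ranks: rank 0 -> [0 1 2 3], rank 1 -> [4 5 6], rank 2 -> [7 8 9]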
def get_ravel_offsets(npoints_thread,natoms):
""" Get lengths and offsets for gathering trajectory fragments """
size = len(npoints_thread)
ravel_lengths = np.zeros(size,dtype=np.intc)
ravel_offsets = np.zeros(size,dtype=np.intc)
for i in range(size):
ravel_lengths[i] = npoints_thread[i]*3*natoms
ravel_offsets[i] = sum(ravel_lengths[:i])
ravel_lengths = tuple(ravel_lengths)
ravel_offsets = tuple(ravel_offsets)
return ravel_lengths, ravel_offsets
def _count_vocab(self, raw_documents, fixed_vocab):
"""Create sparse feature matrix, and vocabulary where fixed_vocab=False
"""
if fixed_vocab:
vocabulary = self.vocabulary_
else:
# Add a new value when a new vocabulary item is seen
vocabulary = defaultdict()
vocabulary.default_factory = vocabulary.__len__
analyze = self.build_analyzer()
j_indices = _make_int_array()
indptr = _make_int_array()
indptr.append(0)
for doc in raw_documents:
for feature in analyze(doc):
try:
j_indices.append(vocabulary[feature])
except KeyError:
# Ignore out-of-vocabulary items for fixed_vocab=True
continue
indptr.append(len(j_indices))
if not fixed_vocab:
# disable defaultdict behaviour
vocabulary = dict(vocabulary)
if not vocabulary:
raise ValueError("empty vocabulary; perhaps the documents only"
" contain stop words")
j_indices = frombuffer_empty(j_indices, dtype=np.intc)
indptr = np.frombuffer(indptr, dtype=np.intc)
values = np.ones(len(j_indices))
X = sp.csr_matrix((values, j_indices, indptr),
shape=(len(indptr) - 1, len(vocabulary)),
dtype=self.dtype)
X.sum_duplicates()
return vocabulary, X
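# The indptr / j_indices / values triple built above maps directly onto SciPy's
# CSR constructor. A self-contained toy version of the same pattern (the names
# below are illustrative, not the class's own helpers):
from collections import defaultdict
import numpy as np
import scipy.sparse as sp

docs = [['a', 'b', 'a'], ['b', 'c']]
vocabulary = defaultdict()
vocabulary.default_factory = vocabulary.__len__

j_indices, indptr = [], [0]
for doc in docs:
    for token in doc:
        j_indices.append(vocabulary[token])
    indptr.append(len(j_indices))

values = np.ones(len(j_indices))
X = sp.csr_matrix((values, j_indices, indptr),
                  shape=(len(indptr) - 1, len(vocabulary)))
X.sum_duplicates()
print(dict(vocabulary))                   # {'a': 0, 'b': 1, 'c': 2}
print(X.toarray())                        # [[2. 1. 0.] [0. 1. 1.]]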
def _count_vocab(self, raw_documents, fixed_vocab):
"""Create sparse feature matrix, and vocabulary where fixed_vocab=False
"""
if fixed_vocab:
vocabulary = self.vocabulary_
else:
# Add a new value when a new vocabulary item is seen
vocabulary = defaultdict()
vocabulary.default_factory = vocabulary.__len__
analyze = self.build_analyzer()
j_indices = []
indptr = _make_int_array()
values = _make_int_array()
indptr.append(0)
for doc in raw_documents:
feature_counter = {}
for feature in analyze(doc):
try:
feature_idx = vocabulary[feature]
if feature_idx not in feature_counter:
feature_counter[feature_idx] = 1
else:
feature_counter[feature_idx] += 1
except KeyError:
# Ignore out-of-vocabulary items for fixed_vocab=True
continue
j_indices.extend(feature_counter.keys())
values.extend(feature_counter.values())
indptr.append(len(j_indices))
if not fixed_vocab:
# disable defaultdict behaviour
vocabulary = dict(vocabulary)
if not vocabulary:
raise ValueError("empty vocabulary; perhaps the documents only"
" contain stop words")
j_indices = np.asarray(j_indices, dtype=np.intc)
indptr = np.frombuffer(indptr, dtype=np.intc)
values = frombuffer_empty(values, dtype=np.intc)
X = sp.csr_matrix((values, j_indices, indptr),
shape=(len(indptr) - 1, len(vocabulary)),
dtype=self.dtype)
X.sort_indices()
return vocabulary, X
def _count_vocab_2(self, raw_documents, fixed_vocab):
"""Create sparse feature matrix, and vocabulary where fixed_vocab=False
"""
if fixed_vocab:
vocabulary = self.vocabulary_
else:
# Add a new value when a new vocabulary item is seen
vocabulary = defaultdict()
vocabulary.default_factory = vocabulary.__len__
analyze = self.build_analyzer()
j_indices = []
indptr = _make_int_array()
# values = _make_int_array()
values = array.array(str("f"))
indptr.append(0)
for doc in raw_documents:
feature_counter = {}
for feature in analyze(doc):
try:
feature_idx = vocabulary[feature]
if feature_idx not in feature_counter:
feature_counter[feature_idx] = 1
else:
feature_counter[feature_idx] += 1
except KeyError:
# Ignore out-of-vocabulary items for fixed_vocab=True
continue
j_indices.extend(feature_counter.keys())
values.extend([i * 1.0 / sum(feature_counter.values()) for i in feature_counter.values()])
indptr.append(len(j_indices))
if not fixed_vocab:
# disable defaultdict behaviour
vocabulary = dict(vocabulary)
if not vocabulary:
raise ValueError("empty vocabulary; perhaps the documents only"
" contain stop words")
j_indices = np.asarray(j_indices, dtype=np.intc)
indptr = np.frombuffer(indptr, dtype=np.intc)
values = frombuffer_empty(values, dtype=np.float32)
X = sp.csr_matrix((values, j_indices, indptr),
shape=(len(indptr) - 1, len(vocabulary)))
X.sort_indices()
return vocabulary, X
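# Relative to _count_vocab, the only substantive change above is that each entry
# is divided by the document's token total, so rows hold term frequencies rather
# than raw counts. A tiny check of that normalisation (illustrative values):
feature_counter = {0: 2, 1: 1}            # token 0 appears twice, token 1 once
total = sum(feature_counter.values())
print([c * 1.0 / total for c in feature_counter.values()])   # [0.666..., 0.333...]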