def file_to_subset_setup(request):
ids = [2, 4, 6]
flows = [3.1, -9999.0, 5.0]
date = '2017-04-29_00:00:00'
flows = ma.masked_array(flows, mask=[0, 1, 0]) # explicit mask
with Dataset(_file_to_subset, 'w') as nc:
nc.model_output_valid_time = date
dim = nc.createDimension('feature_id', 3)
id_var = nc.createVariable('feature_id', 'i', ('feature_id',))
id_var[:] = ids
flow_var = nc.createVariable('streamflow', 'f', ('feature_id',),
fill_value=-9999.0)
flow_var[:] = flows
extra_var = nc.createVariable('extra_var', 'i', ('feature_id',))
extra_var[:] = [1, 2, 3]
def file_to_subset_teardown():
os.remove(_file_to_subset)
request.addfinalizer(file_to_subset_teardown)
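A hedged read-back sketch for the fixture above: by default, netCDF4 returns variables as masked arrays, masking any value equal to the declared fill_value (the path argument stands in for the fixture's _file_to_subset).

from netCDF4 import Dataset
import numpy.ma as ma

def check_subset_readback(path):
    # values written as -9999.0 should come back masked on read
    with Dataset(path) as nc:
        flows = nc.variables['streamflow'][:]
    assert ma.is_masked(flows[1])      # the -9999.0 slot
    assert not ma.is_masked(flows[0])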
def files_to_cube_setup(request):
date_template = '2017-04-29_0{0}:00:00'
for i, nc_file in enumerate(_files_to_cube):
date = date_template.format(i)
flows = [flow * (i + 1) for flow in _flows_template]
if i == 1:
flows[1] = -9999.0 # one way of masking data
elif i == 2:
flows = ma.masked_array(flows, mask=[0, 1, 0]) # explicit mask
with Dataset(nc_file, 'w') as nc:
nc.model_output_valid_time = date
dim = nc.createDimension('feature_id', 3)
id_var = nc.createVariable('feature_id', 'i', ('feature_id',))
id_var[:] = _ids
flow_var = nc.createVariable('streamflow', 'f', ('feature_id',),
fill_value=-9999.0)
flow_var[:] = flows
def files_to_cube_teardown():
for nc_file in _files_to_cube:
os.remove(nc_file)
request.addfinalizer(files_to_cube_teardown)
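The per-timestep files written above are meant to be assembled into a (time, feature) cube; a minimal sketch of that masking pattern in plain NumPy (the values are illustrative, not the fixture's):

import numpy.ma as ma

# stack three timesteps; -9999.0 marks missing flows
cube = ma.stack([ma.masked_values([3.1, -9999.0, 5.0], -9999.0)
                 for _ in range(3)])
assert cube.shape == (3, 3)
assert cube.mask[:, 1].all()   # feature 1 is masked at every timestep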
def file_to_read_streamflow_setup(request):
ids = [2, 4, 6]
flows = [1.3, -9999.0, 5.1]
date = '2017-04-29_04:00:00'
flows = ma.masked_array(flows, mask=[0, 1, 0]) # explicit mask
with Dataset(_file_to_read_streamflow, 'w') as nc:
nc.model_output_valid_time = date
dim = nc.createDimension('feature_id', 3)
id_var = nc.createVariable('feature_id', 'i', ('feature_id',))
id_var[:] = ids
flow_var = nc.createVariable('streamflow', 'f', ('feature_id',),
fill_value=-9999.0)
flow_var[:] = flows
def file_to_read_streamflow_teardown():
os.remove(_file_to_read_streamflow)
request.addfinalizer(file_to_read_streamflow_teardown)
def resample(self):
al, o = np.log(self.alpha_0), self.obs_distn
self.z = ma.masked_array(self.z,mask=np.zeros(self.z.shape))
model = self.model
for n in np.random.permutation(self.data.shape[0]):
# mask out n
self.z.mask[n] = True
# form the scores and sample them
ks = list(model._get_occupied())
scores = np.array([
np.log(model._get_counts(k))+ o.log_predictive(self.data[n],model._get_data_withlabel(k)) \
for k in ks] + [al + o.log_marginal_likelihood(self.data[n])])
        # sample; note: the mask gets fixed by assigning into the array
        idx = sample_discrete_from_log(scores)
        if idx == scores.shape[0]-1:
            self.z[n] = self._new_label(ks)
        else:
            self.z[n] = ks[idx]
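The "mask gets fixed" comment relies on a numpy.ma behavior worth seeing in isolation: assigning into a masked slot clears its mask. A small self-contained sketch:

import numpy as np
import numpy.ma as ma

z = ma.masked_array(np.zeros(3), mask=np.zeros(3, dtype=bool))
z.mask[1] = True          # temporarily hide entry 1, as resample() does
assert z.count() == 2
z[1] = 7.0                # assignment into the slot un-masks it
assert z.count() == 3 and z[1] == 7.0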
def test_record_array_with_object_field():
# Trac #1839
y = ma.masked_array(
[(1, '2'), (3, '4')],
mask=[(0, 0), (0, 1)],
        dtype=[('a', int), ('b', object)])  # np.object is removed in modern NumPy
# getting an item used to fail
y[1]
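For reference, a hedged sketch of what this regression exercises: indexing a record row whose fields are partially masked.

import numpy.ma as ma

y = ma.masked_array([(1, '2'), (3, '4')],
                    mask=[(0, 0), (0, 1)],
                    dtype=[('a', int), ('b', object)])
row = y[1]                    # used to raise before the Trac #1839 fix
assert row['a'] == 3
assert row['b'] is ma.masked  # the masked field reads back as `masked`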
def maskoceans(lonsin,latsin,datain,inlands=True,resolution='l',grid=5):
"""
mask data (``datain``), defined on a grid with latitudes ``latsin``
longitudes ``lonsin`` so that points over water will not be plotted.
.. tabularcolumns:: |l|L|
============== ====================================================
Arguments Description
============== ====================================================
lonsin, latsin rank-2 arrays containing longitudes and latitudes of
grid.
datain rank-2 input array on grid defined by ``lonsin`` and
``latsin``.
    inlands        if False, mask only ocean points and not inland
lakes (Default True).
resolution gshhs coastline resolution used to define land/sea
mask (default 'l', available 'c','l','i','h' or 'f')
grid land/sea mask grid spacing in minutes (Default 5;
10, 2.5 and 1.25 are also available).
============== ====================================================
returns a masked array the same shape as datain with "wet" points masked.
"""
# read in land/sea mask.
lsmask_lons, lsmask_lats, lsmask =\
_readlsmask(lakes=inlands,resolution=resolution,grid=grid)
# nearest-neighbor interpolation to output grid.
lsmasko = interp(lsmask,lsmask_lons,lsmask_lats,lonsin,latsin,masked=True,order=0)
# mask input data.
mask = lsmasko == 0
return ma.masked_array(datain,mask=mask)
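A hedged usage sketch for maskoceans (assumes mpl_toolkits.basemap is installed; the synthetic field is illustrative):

import numpy as np
from mpl_toolkits.basemap import maskoceans

lons, lats = np.meshgrid(np.arange(-180.0, 180.0, 2.0),
                         np.arange(-90.0, 90.0, 2.0))
field = np.cos(np.radians(lats))   # rank-2 synthetic data on the grid
masked = maskoceans(lons, lats, field, inlands=True, resolution='l', grid=5)
print('%d of %d points masked as wet' % (masked.mask.sum(), masked.size))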
def predict(self, X, quantile=None):
"""
Predict regression value for X.
Parameters
----------
X : array-like or sparse matrix of shape = [n_samples, n_features]
The input samples. Internally, it will be converted to
``dtype=np.float32`` and if a sparse matrix is provided
to a sparse ``csr_matrix``.
        quantile : int, optional
            Value ranging from 0 to 100. By default (``None``), the mean
            prediction is returned.
Returns
-------
y : array of shape = [n_samples]
If quantile is set to None, then return E(Y | X). Else return
y such that F(Y=y | x) = quantile.
"""
# apply method requires X to be of dtype np.float32
X = check_array(X, dtype=np.float32, accept_sparse="csc")
if quantile is None:
return super(BaseForestQuantileRegressor, self).predict(X)
sorter = np.argsort(self.y_train_)
X_leaves = self.apply(X)
        quantiles = np.zeros((X.shape[0]))
for i, x_leaf in enumerate(X_leaves):
mask = self.y_train_leaves_ != np.expand_dims(x_leaf, 1)
x_weights = ma.masked_array(self.y_weights_, mask)
weights = x_weights.sum(axis=0)
quantiles[i] = weighted_percentile(
self.y_train_, quantile, weights, sorter)
return quantiles
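A hedged usage sketch for the quantile predictor above, assuming it is wrapped by a scikit-garden-style RandomForestQuantileRegressor:

import numpy as np
from skgarden import RandomForestQuantileRegressor   # assumed package

rng = np.random.RandomState(0)
X = rng.rand(200, 3)
y = X.sum(axis=1) + rng.normal(scale=0.1, size=200)

rfqr = RandomForestQuantileRegressor(n_estimators=25, random_state=0)
rfqr.fit(X, y)
lo = rfqr.predict(X[:5], quantile=5)    # 5th-percentile prediction
hi = rfqr.predict(X[:5], quantile=95)   # 95th-percentile prediction
mean = rfqr.predict(X[:5])              # quantile=None -> E(Y | X)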
def SGD(x):
global n_updates_acc
global mse
for val in x:
row_block_id = val[0]
v_iter = val[1][0]
w_iter = val[1][1]
h_iter = val[1][2]
# dictionaries to store W and H
w = {xw[0]:xw[1] for xw in w_iter}
h = {xh[0]:xh[1] for xh in h_iter}
# go through V and update W and H
for v_ij in v_iter:
i, j = v_ij
# get row and column
w_i = w[i]
h_j = h[j]
# calculate error
error = 5 - np.dot(w_i,h_j)
# increment MSE
mse += error**2
            # gradients with L2 loss: compute both updates from the
            # pre-update values, then apply them (the dictionary values
            # are updated in place through w_i and h_j)
            h_update = step_size.value*(-2*error*w_i + 2.0*reg.value*h_j)
            h_update_mx = ma.masked_array(h_update, mask.value)
            w_update = step_size.value*(-2*error*h_j + 2.0*reg.value*w_i)
            h_j -= h_update_mx.filled(0.0)  # masked entries of H stay frozen
            w_i -= w_update
# increment num updates
n_updates_acc += 1
# must massage results in something that will return properly
output = {}
for row_index in w:
output[('W', row_index)] = (row_index, w[row_index])
for col_index in h:
output[('H', col_index)] = (col_index, h[col_index])
# return iterator of updated W and H
return tuple((output.items()))
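The masked-gradient trick used in SGD above, isolated: a sketch (names hypothetical) where masked entries of a gradient are zeroed out so the corresponding parameters are left untouched.

import numpy as np
import numpy.ma as ma

params = np.array([1.0, 2.0, 3.0])
grad = np.array([0.5, -0.2, 0.1])
frozen = np.array([False, True, False])          # True = do not update
params -= ma.masked_array(grad, mask=frozen).filled(0.0)
print(params)                                    # entry 1 is unchanged: 2.0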
def test_extract_overlimit():
""" Thest a request over the limits of the database """
db = WOA()
t = db['sea_water_temperature'].extract(var='t_mn', doy=136.875,
depth=5502, lat=17.5, lon=-37.5)
assert ma.is_masked(t['t_mn'])
t = db['sea_water_temperature'].extract(var='t_mn', doy=136.875,
depth=[10, 5502], lat=17.5, lon=-37.5)
assert np.all(t['t_mn'].mask == [False, True])
assert ma.allclose(t['t_mn'],
ma.masked_array([24.62145996, 0], mask=[False, True]))
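The two masked-array assertions used above, in isolation: ma.is_masked detects masked scalars, and ma.allclose treats masked slots as equal by default.

import numpy.ma as ma

t = ma.masked_array([24.62145996, 0.0], mask=[False, True])
assert ma.is_masked(t[1]) and not ma.is_masked(t[0])
# masked positions compare equal regardless of the underlying fill data
assert ma.allclose(t, ma.masked_array([24.62145996, 99.0], mask=[False, True]))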
def update_data(self):
var = getattr(self._sim, self._variable)[:,0:2]
mask = None
if self._sub_domain:
pos = self._sim.positions
mask_x = np.logical_or(pos[:, 0] <= self._sub_domain[0][0],
pos[:, 0] >= self._sub_domain[0][1])
mask_y = np.logical_or(pos[:, 1] <= self._sub_domain[1][0],
pos[:, 1] >= self._sub_domain[1][1])
mask = np.logical_or(mask_x, mask_y)
if self._particle_type is not None:
if mask is None:
mask = (self._sim.types != self._particle_type)
else:
mask = np.logical_or(mask, (self._sim.types != self._particle_type))
if mask is not None:
tiledmask = np.transpose(np.tile(mask, (2, 1)))
var = ma.masked_array(var, tiledmask)
var = var.compressed()
var = var.reshape([len(var)//2, 2])
hist, self._x_edges, self._y_edges = np.histogram2d(var[:, 0], var[:, 1],
bins=self._nr_of_bins, range=self._hist_range)
if self._window is not None:
self._dataHistory.append(hist)
if len(self._dataHistory) > self._window:
del self._dataHistory[0]
self._histogram_array = sum(self._dataHistory)
else:
self._histogram_array += hist
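The tile-then-mask step above in isolation: one boolean per particle is broadcast across both coordinate columns, and compressed() drops the masked rows (values illustrative).

import numpy as np
import numpy.ma as ma

var = np.arange(8.0).reshape(4, 2)            # four particles, (x, y) each
drop = np.array([False, True, False, True])   # per-particle mask
tiled = np.transpose(np.tile(drop, (2, 1)))   # shape (4, 2), one col per axis
kept = ma.masked_array(var, tiled).compressed()
kept = kept.reshape(len(kept) // 2, 2)        # rows 0 and 2 survive
print(kept)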
def _dense_fit(self, X, strategy, missing_values, axis):
"""Fit the transformer on dense data."""
X = check_array(X, force_all_finite=False)
mask = _get_mask(X, missing_values)
masked_X = ma.masked_array(X, mask=mask)
# Mean
if strategy == "mean":
mean_masked = np.ma.mean(masked_X, axis=axis)
# Avoid the warning "Warning: converting a masked element to nan."
mean = np.ma.getdata(mean_masked)
mean[np.ma.getmask(mean_masked)] = np.nan
return mean
# Median
elif strategy == "median":
if tuple(int(v) for v in np.__version__.split('.')[:2]) < (1, 5):
# In old versions of numpy, calling a median on an array
                # containing nans returns nan. This is different in
                # recent versions of numpy, which we want to mimic
masked_X.mask = np.logical_or(masked_X.mask,
np.isnan(X))
median_masked = np.ma.median(masked_X, axis=axis)
# Avoid the warning "Warning: converting a masked element to nan."
median = np.ma.getdata(median_masked)
median[np.ma.getmaskarray(median_masked)] = np.nan
return median
# Most frequent
elif strategy == "most_frequent":
            # scipy.stats.mstats.mode cannot be used because it will not work
            # properly if the first element is masked and if its frequency
            # is equal to the frequency of the most frequent valid element.
            # See https://github.com/scipy/scipy/issues/2636
            # To be able to access the elements by column
if axis == 0:
X = X.transpose()
mask = mask.transpose()
most_frequent = np.empty(X.shape[0])
for i, (row, row_mask) in enumerate(zip(X[:], mask[:])):
            row_mask = np.logical_not(row_mask).astype(bool)
row = row[row_mask]
most_frequent[i] = _most_frequent(row, np.nan, 0)
return most_frequent
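The core masked-statistics idea in _dense_fit, reduced to a sketch: mask the missing entries, then take column means that silently skip them.

import numpy as np
import numpy.ma as ma

X = np.array([[1.0, np.nan],
              [3.0, 4.0]])
masked_X = ma.masked_array(X, mask=np.isnan(X))
col_means = np.ma.mean(masked_X, axis=0)
print(np.ma.getdata(col_means))   # [2.0, 4.0]: nan was skipped, not averaged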
def file_to_combine_setup(request):
file_pattern = 'combine_me_comids_{0}consistent{1}.nc'
tempdir = tempfile.gettempdir()
consistent_id_order = [join(tempdir, file_pattern.format('', i))
for i in range(3)]
inconsistent_id_order = [join(tempdir, file_pattern.format('in', i))
for i in range(3)]
ids = [2, 4, 6, 8]
flows_template = [3.1, 2.2, 5.0, 7.1]
date_template = '2017-04-29_0{0}:00:00'
for i, nc_file in enumerate(consistent_id_order):
date = date_template.format(i)
flows = [flow * (i + 1) for flow in flows_template]
if i == 1:
flows[1] = -9999.0 # one way of masking data
elif i == 2:
flows = ma.masked_array(flows, mask=[0, 1, 0, 0]) # explicit mask
with Dataset(nc_file, 'w') as nc:
nc.model_output_valid_time = date
dim = nc.createDimension('feature_id', 4)
id_var = nc.createVariable('feature_id', 'i', ('feature_id',))
id_var[:] = ids
flow_var = nc.createVariable('streamflow', 'f', ('feature_id',),
fill_value=-9999.0)
flow_var[:] = flows
nwm_subset.combine_files(consistent_id_order, _ids_in_order_nc)
for i, nc_file in enumerate(inconsistent_id_order):
date = date_template.format(i)
flows = [flow * (i + 1) for flow in flows_template]
if i == 1:
comids = ids[::-1]
flows = flows[::-1]
else:
comids = ids
with Dataset(nc_file, 'w') as nc:
nc.model_output_valid_time = date
dim = nc.createDimension('feature_id', 4)
id_var = nc.createVariable('feature_id', 'i', ('feature_id',))
id_var[:] = comids
flow_var = nc.createVariable('streamflow', 'f', ('feature_id',),
fill_value=-9999.0)
flow_var[:] = flows
nwm_subset.combine_files(inconsistent_id_order, _ids_not_in_order_nc,
river_ids=[2], consistent_id_order=False)
delete_me = consistent_id_order + inconsistent_id_order
for filename in delete_me:
os.remove(filename)
def file_to_combine_teardown():
os.remove(_ids_in_order_nc)
os.remove(_ids_not_in_order_nc)
request.addfinalizer(file_to_combine_teardown)
def _initialize(self, data, alpha=1.0, sigma_w=1, initial_Z=None, initial_W=None, KK=None):
if data is None:
# @debug if data=None !
data = np.zeros((1,1))
if type(data) is not ma.masked_array:
# Ignore Diagonal
data = np.ma.array(data, mask=np.zeros(data.shape))
np.fill_diagonal(data, ma.masked)
self.mask = data.mask
self.symmetric = (data == data.T).all()
self.nnz = len(data.compressed())
super(IBPGibbsSampling, self)._initialize(data, alpha, initial_Z, KK=KK)
self._mean_w = 0
assert(type(sigma_w) is float)
self._sigma_w = sigma_w
        self._sigb = 1  # careful: this can overflow in the exp of the sigmoid!
        self._W_prior = np.zeros((1, self._K))
        if initial_W is not None:
self._W = initial_W
else:
if self.assortativity == 1:
# Identity
self._W = (np.ones((self._K, self._K))*W_diag) * (np.ones((self._K)) + np.eye(self._K)*-2)
elif self.assortativity == 2:
# Bivariate Gaussian
v = 10
x, y = np.mgrid[-v:v:self._K*1j, -v:v:self._K*1j]
xy = np.column_stack([x.flat, y.flat])
mu = np.array([0, 0])
sigma = np.array([1, 1])
covariance = np.array([[v*100,0],[0,v/10]])
theta = np.pi / 4
rot = np.array([[np.cos(theta), -np.sin(theta)],[np.sin(theta), np.cos(theta)]])
covariance = rot.dot(covariance).dot(rot.T)
z = sp.stats.multivariate_normal.pdf(xy, mean=mu, cov=covariance)
z = 400 * z.reshape(x.shape)
self.z_mean = z - np.ones(z.shape)*1
self._W = np.random.normal(self.z_mean, self._sigma_w, (self._K, self._K))
else:
self._W = np.random.normal(self._mean_w, self._sigma_w, (self._K, self._K))
if self.symmetric:
self._W = np.tril(self._W) + np.tril(self._W, -1).T
np.fill_diagonal(self._W, 1)
#self._Z = csr_matrix(self._Z)
#self._Z = lil_matrix(self._Z)
assert(self._W.shape == (self._K, self._K))
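The diagonal-masking step at the top of _initialize, in isolation: np.fill_diagonal with ma.masked hides self-links, so they are excluded from compressed() counts.

import numpy as np
import numpy.ma as ma

data = np.ma.array(np.arange(9.0).reshape(3, 3),
                   mask=np.zeros((3, 3), dtype=bool))
np.fill_diagonal(data, ma.masked)     # ignore the diagonal (self-links)
assert data.compressed().size == 6    # only off-diagonal entries remain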