def replace_missing(X):
    # Sparse CSR matrices are returned untouched; anything without .getformat()
    # (e.g. a dense ndarray) has its NaNs filled in place with a sentinel.
    try:
        if X.getformat() == 'csr':
            return X
    except:
        X[np.isnan(X)] = -999.0  # djajetic 05.09.2015
        return X                 # djajetic 05.09.2015
    p = len(X)
    nn = len(X[0]) * 2
    XX = np.zeros([p, nn])
    for i in range(len(X)):
        line = X[i]
        line1 = [0 if np.isnan(x) else x for x in line]
        line2 = [1 if np.isnan(x) else 0 for x in line]  # indicator of missingness
        XX[i] = line1 + line2
    return XX
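# Example (a minimal sketch, not from the original repo): for a dense NumPy
# array the except branch above fills NaNs with -999.0 in place. Assumes
# numpy is imported as np; the data is made up.
import numpy as np

X_demo = np.array([[1.0, np.nan, 3.0],
                   [np.nan, 5.0, 6.0]])
print(replace_missing(X_demo))
# [[   1. -999.    3.]
#  [-999.    5.    6.]]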
def rhoA(self):
    # rho_A reliability coefficient, one value per latent variable.
    rhoA = pd.DataFrame(0, index=np.arange(1), columns=self.latent)
    for i in range(self.lenlatent):
        weights = pd.DataFrame(self.outer_weights[self.latent[i]])
        weights = weights[(weights.T != 0).any()]
        result = pd.DataFrame.dot(weights.T, weights)
        result_ = pd.DataFrame.dot(weights, weights.T)
        S = self.data_[self.Variables['measurement'][
            self.Variables['latent'] == self.latent[i]]]
        S = pd.DataFrame.dot(S.T, S) / S.shape[0]
        numerador = (
            np.dot(np.dot(weights.T, (S - np.diag(np.diag(S)))), weights))
        denominador = (
            (np.dot(np.dot(weights.T, (result_ - np.diag(np.diag(result_)))), weights)))
        rhoA_ = ((result)**2) * (numerador / denominador)
        if np.isnan(rhoA_.values):
            rhoA[self.latent[i]] = 1
        else:
            rhoA[self.latent[i]] = rhoA_.values
    return rhoA.T
def get(self, X):
    X = np.array(X)
    X_nan = np.isnan(X)
    imputed = self.meanImput(X.copy())
    if len(self.estimators_) > 1:
        for i, estimator_ in enumerate(self.estimators_):
            # Predict column i from all other (mean-imputed) columns.
            X_s = np.delete(imputed, i, 1)
            y_nan = X_nan[:, i]
            X_unk = X_s[y_nan]
            result_ = []
            if len(X_unk) > 0:
                for unk in X_unk:
                    result_.append(estimator_.predict(unk))
                X[y_nan, i] = result_
    return X
def treegauss_remove_row(
        data_row,
        tree_grid,
        latent_row,
        vert_ss,
        edge_ss,
        feat_ss, ):
    # Update sufficient statistics.
    for v in range(latent_row.shape[0]):
        z = latent_row[v, :]
        vert_ss[v, :, :] -= np.outer(z, z)
    for e in range(tree_grid.shape[1]):
        z1 = latent_row[tree_grid[1, e], :]
        z2 = latent_row[tree_grid[2, e], :]
        edge_ss[e, :, :] -= np.outer(z1, z2)
    for v, x in enumerate(data_row):
        if np.isnan(x):
            continue
        z = latent_row[v, :]
        feat_ss[v, 0] -= 1  # observation count lives in slot 0, consistent with the slots below
        feat_ss[v, 1] -= x
        feat_ss[v, 2:] -= x * z  # TODO Use central covariance.
def test_train(self):
    model, fetches_ = self._test_pipeline(tf.contrib.learn.ModeKeys.TRAIN)
    predictions_, loss_, _ = fetches_

    target_len = self.sequence_length + 10 + 2
    max_decode_length = model.params["target.max_seq_len"]
    expected_decode_len = np.minimum(target_len, max_decode_length)

    np.testing.assert_array_equal(predictions_["logits"].shape, [
        self.batch_size, expected_decode_len - 1,
        model.target_vocab_info.total_size
    ])
    np.testing.assert_array_equal(predictions_["losses"].shape,
                                  [self.batch_size, expected_decode_len - 1])
    np.testing.assert_array_equal(predictions_["predicted_ids"].shape,
                                  [self.batch_size, expected_decode_len - 1])
    self.assertFalse(np.isnan(loss_))
def information_ratio(algorithm_returns, benchmark_returns):
    """
    http://en.wikipedia.org/wiki/Information_ratio

    Args:
        algorithm_returns (np.array-like):
            All returns during algorithm lifetime.
        benchmark_returns (np.array-like):
            All benchmark returns during algo lifetime.

    Returns:
        float. Information ratio.
    """
    relative_returns = algorithm_returns - benchmark_returns
    relative_deviation = relative_returns.std(ddof=1)
    if zp_math.tolerant_equals(relative_deviation, 0) or \
            np.isnan(relative_deviation):
        return 0.0
    return np.mean(relative_returns) / relative_deviation
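# Worked example (an illustrative sketch, not zipline code): the same
# calculation on made-up return series, with np.isclose standing in for
# zipline's zp_math.tolerant_equals guard.
import numpy as np

algo = np.array([0.01, 0.02, -0.005, 0.015])
bench = np.array([0.008, 0.01, 0.0, 0.01])
rel = algo - bench                     # active returns
dev = rel.std(ddof=1)                  # sample standard deviation
ir = 0.0 if (np.isclose(dev, 0) or np.isnan(dev)) else np.mean(rel) / dev
print(round(ir, 3))                    # ~0.478 for this made-up data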
def raw_data_gen(self):
    for dt, series in self.data.iterrows():
        for sid, price in series.iteritems():
            # Skip SIDs that can not be forward filled
            if np.isnan(price) and \
                    sid not in self.started_sids:
                continue
            self.started_sids.add(sid)
            event = {
                'dt': dt,
                'sid': sid,
                'price': price,
                # Just chose something large
                # if no volume available.
                'volume': 1e9,
            }
            yield event
def test_nan_filter_dataframe(self):
    dates = pd.date_range('1/1/2000', periods=2, freq='B', tz='UTC')
    df = pd.DataFrame(np.random.randn(2, 2),
                      index=dates,
                      columns=[4, 5])
    # should be filtered
    df.loc[dates[0], 4] = np.nan
    # should not be filtered, should have been ffilled
    df.loc[dates[1], 5] = np.nan
    source = DataFrameSource(df)
    event = next(source)
    self.assertEqual(5, event.sid)
    event = next(source)
    self.assertEqual(4, event.sid)
    event = next(source)
    self.assertEqual(5, event.sid)
    self.assertFalse(np.isnan(event.price))
def df_type_to_str(i):
    '''
    Convert into simple datatypes from pandas/numpy types
    '''
    if isinstance(i, np.bool_):
        return bool(i)
    if isinstance(i, np.int_):
        return int(i)
    if isinstance(i, np.floating):  # originally np.float, which modern NumPy no longer provides
        if np.isnan(i):
            return 'NaN'
        elif np.isinf(i):
            return str(i)
        return float(i)
    if isinstance(i, np.uint):
        return int(i)
    if type(i) == bytes:
        return i.decode('UTF-8')
    if isinstance(i, (tuple, list)):
        return str(i)
    if i is pd.NaT:  # not identified as a float null
        return 'NaN'
    return str(i)
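# Example calls (a hedged sketch with made-up inputs; exact integer matching
# depends on the platform's default NumPy integer type).
import numpy as np
import pandas as pd

print(df_type_to_str(np.bool_(True)))       # True
print(df_type_to_str(np.int_(7)))           # 7
print(df_type_to_str(np.float64(np.nan)))   # 'NaN'
print(df_type_to_str(np.float64(np.inf)))   # 'inf'
print(df_type_to_str(pd.NaT))               # 'NaN'
print(df_type_to_str(b'bytes'))             # 'bytes'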
def calc_reward(self, action=0, state=None, **kw):
    """Calculate the reward for the specified transition."""
    eps1, eps2 = self.eps_values_for_actions[action]
    if state is None:
        state = self.observe()
    if self.logspace:
        T1, T2, T1s, T2s, V, E = 10**state
    else:
        T1, T2, T1s, T2s, V, E = state
    # the reward function penalizes treatment because of side-effects
    reward = -0.1*V - 2e4*eps1**2 - 2e3*eps2**2 + 1e3*E
    # Constrain reward to be within specified range
    if np.isnan(reward):
        reward = -self.reward_bound
    elif reward > self.reward_bound:
        reward = self.reward_bound
    elif reward < -self.reward_bound:
        reward = -self.reward_bound
    return reward
def to_rgb(img):
    """
    Converts the given array into a RGB image. If the number of channels is not
    3 the array is tiled such that it has 3 channels. Finally, the values are
    rescaled to [0,255).

    :param img: the array to convert [nx, ny, channels]
    :returns img: the rgb image [nx, ny, 3]
    """
    img = np.atleast_3d(img)
    channels = img.shape[2]
    if channels < 3:
        img = np.tile(img, 3)
    img[np.isnan(img)] = 0
    img -= np.amin(img)
    img /= np.amax(img)
    img *= 255
    return img
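# Example (a small sketch with made-up data): NaNs become 0, then the image is
# min-max rescaled; note a constant-valued input would divide by zero here.
import numpy as np

gray = np.array([[0.2, np.nan],
                 [0.6, 1.0]])
rgb = to_rgb(gray)
print(rgb.shape)                 # (2, 2, 3)
print(rgb.min(), rgb.max())      # 0.0 255.0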
def SMA(Series, N, M=1):
    ret = []
    i = 1
    length = len(Series)
    # skip any leading NaN values in the series
    while i < length:
        if np.isnan(Series[i]):
            i += 1
        else:
            break
    preY = Series[i]  # Y'
    ret.append(preY)
    while i < length:
        Y = (M * Series[i] + (N - M) * preY) / float(N)
        ret.append(Y)
        preY = Y
        i += 1
    return pd.Series(ret)
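# Example (illustrative only): the leading NaN is skipped and the first valid
# value seeds the running average.
import numpy as np
import pandas as pd

s = pd.Series([np.nan, 1.0, 2.0, 3.0, 4.0])
print(SMA(s, N=3).values)
# [1.         1.         1.33333333 1.88888889 2.59259259]  (approximately)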
def map(self, data):
    data = data[self.fieldName]
    colors = np.empty((len(data), 4))
    default = np.array(fn.colorTuple(self['Default'])) / 255.
    colors[:] = default
    for v in self.param('Values'):
        mask = data == v.maskValue
        c = np.array(fn.colorTuple(v.value())) / 255.
        colors[mask] = c
    #scaled = np.clip((data-self['Min']) / (self['Max']-self['Min']), 0, 1)
    #cmap = self.value()
    #colors = cmap.map(scaled, mode='float')

    #mask = np.isnan(data) | np.isinf(data)
    #nanColor = self['NaN']
    #nanColor = (nanColor.red()/255., nanColor.green()/255., nanColor.blue()/255., nanColor.alpha()/255.)
    #colors[mask] = nanColor
    return colors
def round_solution_pool(pool, constraints):
    pool.distinct().sort()
    P = pool.P
    L0_reg_ind = np.isnan(constraints['coef_set'].C_0j)
    L0_max = constraints['L0_max']
    rounded_pool = SolutionPool(P)

    for solution in pool.solutions:
        # sort from largest to smallest coefficients
        feature_order = np.argsort([-abs(x) for x in solution])
        rounded_solution = np.zeros(shape=(1, P))
        l0_norm_count = 0
        for k in range(0, P):
            j = feature_order[k]
            if not L0_reg_ind[j]:
                rounded_solution[0, j] = np.round(solution[j], 0)
            elif l0_norm_count < L0_max:
                rounded_solution[0, j] = np.round(solution[j], 0)
                l0_norm_count += L0_reg_ind[j]
        rounded_pool.add(objvals=np.nan, solutions=rounded_solution)

    rounded_pool.distinct().sort()
    return rounded_pool
def checkFSXvalsAgainstADNIMERGE(tadpoleDF, mriADNI1FileFSX, otherSSvisCodeStr, ssNameTag,
                                 ignoreMissingCols=False):
    nrRows, nrCols = tadpoleDF.shape
    colListOtherSS = list(ssDF.columns.values)
    colListTadpoleDF = list(tadpoleDF.columns.values)
    tadpoleDF[['Hippocampus', 'ST29SV%s' % ssNameTag, 'ST88SV%s' % ssNameTag]] = \
        tadpoleDF[['Hippocampus', 'ST29SV%s' % ssNameTag, 'ST88SV%s' % ssNameTag]].apply(pd.to_numeric, errors='coerce')
    tadpoleDF['HIPPOSUM'] = tadpoleDF['ST29SV%s' % ssNameTag] + tadpoleDF['ST88SV%s' % ssNameTag]
    for r in range(nrRows):
        valsNan = np.isnan(tadpoleDF['Hippocampus'][r]) or (np.isnan(tadpoleDF['ST29SV%s' % ssNameTag][r]) and
                                                            np.isnan(tadpoleDF['ST88SV%s' % ssNameTag][r]))
        if valsNan:
            continue
        valsNotEq = tadpoleDF['Hippocampus'][r] != (tadpoleDF['ST29SV%s' % ssNameTag][r] + tadpoleDF['ST88SV%s' % ssNameTag][r])
        if valsNotEq:
            print('entries dont match\n ', tadpoleDF[['RID', 'VISCODE', 'Hippocampus', 'ST29SV%s' % ssNameTag,
                                                      'ST88SV%s' % ssNameTag, 'HIPPOSUM']].iloc[r])
    # Conclusion: the reason why the entries above don't match is that UCSFFSX has duplicate
    # entries for the same subject and viscode.
def test_hz():
    """Test the hz function."""
    df, _ = readSC()
    for (teff, logg, mass) in df.loc[:, ['teff', 'logg', 'mass']].values:
        lum = (teff / 5777)**4 * (mass / ((10**logg) / (10**4.44)))**2
        assert isinstance(hz(teff, lum, model=2), float)
        assert isinstance(hz(teff, lum, model=4), float)

    teff = 5777
    lum = 1
    invalids = [{teff: lum}, [teff, lum], (teff, lum), "..."]
    for model in range(1, 6):
        assert isinstance(hz(teff, lum, model), float)
    results = [0.75, 0.98, 0.99, 1.71, 1.77]
    for model, result in enumerate(results, start=1):
        assert round(hz(teff, lum, model), 2) == result
    for invalid in invalids:
        assert np.isnan(hz(invalid, lum, model))
        assert np.isnan(hz(teff, invalid, model))
    assert hz(teff, lum, 2) < hz(teff, lum, 4)  # hz1 < hz2
def generateWekaFile(X, Y, features, path, name):
    f = open(path + name + '.arff', 'w')
    f.write("@relation '" + name + "'\n\n")
    for feat in features:
        f.write("@attribute " + feat + " numeric\n")
    f.write("@attribute cluster {True,False}\n\n")
    f.write("@data\n\n")
    for i in range(X.shape[0]):
        for j in range(X.shape[1]):
            if np.isnan(X[i, j]):
                f.write("?,")
            else:
                f.write(str(X[i, j]) + ",")
        if Y[i] == 1.0 or Y[i] == True:
            f.write("True\n")
        else:
            f.write("False\n")
    f.close()
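# Example (a minimal sketch with made-up data): writes ./example.arff, using
# '?' for the missing entry as ARFF expects.
import numpy as np

X_demo = np.array([[1.0, np.nan],
                   [0.5, 2.0]])
Y_demo = np.array([1.0, 0.0])
generateWekaFile(X_demo, Y_demo, ['feat_a', 'feat_b'], './', 'example')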
def test_posterior_zeros(self):
    p = np.asarray([.5, 0., 0.]).reshape((1, 3))

    posterior = self.eval(self.posterior, p)
    print('posterior', posterior)
    posterior_grad = self.eval(self.posterior_grad, p)
    print('posterior grad', posterior_grad)

    kl = self.eval(self.posterior_kl, p)
    print(kl)
    self.assertGreater(kl.sum(), 0)
    self.assertFalse(np.isnan(kl).any())
    self.assertTrue(np.isfinite(kl).all())

    grad = self.eval(self.posterior_kl_grad, p)
    print(grad)
    self.assertFalse(np.isnan(grad).any())
    self.assertTrue(np.isfinite(grad).all())
def update_summary(
        var_up,
        var,
        start,
        end,
        ):
    diff = np.abs(var_up - var)
    reldiff = diff / var
    # filter out nan's
    try:
        reldiff = reldiff[~np.isnan(reldiff)]
    except:
        pass
    return (np.mean(diff), np.std(diff), np.mean(reldiff),
            np.std(reldiff), (end - start).microseconds)
def test_bootstrap_replicate_1d(data, seed):
    np.random.seed(seed)
    x = dcst.bootstrap_replicate_1d(data, np.mean)
    np.random.seed(seed)
    x_correct = original.bootstrap_replicate_1d(data[~np.isnan(data)], np.mean)
    # np.isnan takes a single array argument; tolerances apply only to np.isclose.
    assert (np.isnan(x) and np.isnan(x_correct)) \
        or np.isclose(x, x_correct, atol=atol, equal_nan=True)

    np.random.seed(seed)
    x = dcst.bootstrap_replicate_1d(data, np.median)
    np.random.seed(seed)
    x_correct = original.bootstrap_replicate_1d(data[~np.isnan(data)], np.median)
    assert (np.isnan(x) and np.isnan(x_correct)) \
        or np.isclose(x, x_correct, atol=atol, equal_nan=True)

    np.random.seed(seed)
    x = dcst.bootstrap_replicate_1d(data, np.std)
    np.random.seed(seed)
    x_correct = original.bootstrap_replicate_1d(data[~np.isnan(data)], np.std)
    assert (np.isnan(x) and np.isnan(x_correct)) \
        or np.isclose(x, x_correct, atol=atol, equal_nan=True)
def nan_helper(y):
    """
    Helper to handle indices and logical indices of NaNs.

    Input:
        - y, 1d numpy array with possible NaNs
    Output:
        - nans, logical indices of NaNs
        - index, a function, with signature indices = index(logical_indices),
          to convert logical indices of NaNs to 'equivalent' indices
    Example:
        >>> # linear interpolation of NaNs
        >>> nans, x = nan_helper(y)
        >>> y[nans] = NP.interp(x(nans), x(~nans), y[~nans])
    """
    # Source: http://stackoverflow.com/questions/6518811/interpolate-nan-values-in-a-numpy-array
    return NP.isnan(y), lambda z: z.nonzero()[0]
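# Example (the docstring recipe made self-contained; assumes numpy is imported
# as NP, matching the alias used above):
import numpy as NP

y = NP.array([1.0, NP.nan, NP.nan, 4.0, 5.0])
nans, idx = nan_helper(y)
y[nans] = NP.interp(idx(nans), idx(~nans), y[~nans])
print(y)   # [1. 2. 3. 4. 5.]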
def step4():
    key_vec = pickle.loads(open("key_vec.pkl", "rb").read())
    vecs = []
    for ev, vec in enumerate(key_vec.values()):
        x = np.array(vec)
        if np.isnan(x).any():
            # print(vec)
            continue
        vecs.append(x)
    vecs = np.array(vecs)
    kmeans = KMeans(n_clusters=128, init='k-means++', n_init=10, max_iter=300,
                    tol=0.0001, precompute_distances='auto', verbose=0,
                    random_state=None, copy_x=True, n_jobs=1)
    print("now fitting...")
    kmeans.fit(vecs)
    open("kmeans.model", "wb").write(pickle.dumps(kmeans))
    for p in kmeans.predict(vecs):
        print(p)
def _step5(arr):
    kmeans = pickle.loads(open("kmeans.model", "rb").read())
    key, lines, tipe = arr
    print(key)
    open("./tmp/tmp.{tipe}.{key}.txt".format(tipe=tipe, key=key), "w").write("\n".join(lines))
    res = os.popen("./fasttext print-sentence-vectors ./models/model.bin < tmp/tmp.{tipe}.{key}.txt".format(tipe=tipe, key=key)).read()
    w = open("tmp/tmp.{tipe}.{key}.json".format(tipe=tipe, key=key), "w")
    for line in res.split("\n"):
        try:
            vec = list(map(float, line.split()[-100:]))
        except:
            print(line)
            print(res)
            continue
        x = np.array(vec)
        if np.isnan(x).any():
            continue
        cluster = kmeans.predict([vec])
        txt = line.split()[:-100]
        obj = {"txt": txt, "cluster": cluster.tolist()}
        data = json.dumps(obj, ensure_ascii=False)
        w.write(data + "\n")
def test_lm(self):
    hps = get_test_hparams()

    with tf.variable_scope("model"):
        model = LM(hps)

    with self.test_session() as sess:
        tf.initialize_all_variables().run()
        tf.initialize_local_variables().run()

        loss = 1e5
        for i in range(50):
            x, y, w = simple_data_generator(hps.batch_size, hps.num_steps)
            loss, _ = sess.run([model.loss, model.train_op],
                               {model.x: x, model.y: y, model.w: w})
            print("%d: %.3f %.3f" % (i, loss, np.exp(loss)))
            if np.isnan(loss):
                print("NaN detected")
                break

        self.assertLess(loss, 1.0)
def get_series_median_peryear(word_time_series, i_year_words, one_minus=False,
                              start_year=1900, end_year=2000, year_inc=10,
                              exclude_partial_missing=False):
    """
    Return the median array for the values of the words specified per year in i_year_words
    for the specified years.
    """
    medians = []
    r_word_time_series = {}
    if exclude_partial_missing:
        for word, time_series in word_time_series.items():
            if not np.isnan(np.sum(list(time_series.values()))):
                r_word_time_series[word] = time_series
    else:
        r_word_time_series = word_time_series
    for year in range(start_year, end_year + 1, year_inc):
        word_array = np.array([r_word_time_series[word][year] for word in i_year_words[year]
                               if word in r_word_time_series
                               and not np.isnan(r_word_time_series[word][year])
                               and not r_word_time_series[word][year] == 0])
        if len(word_array) == 0:
            continue
        if one_minus:
            word_array = 1 - word_array
        medians.append(np.median(word_array))
    return np.array(medians)
def get_series_mean_std_peryear(word_time_series, i_year_words, one_minus=False,
                                start_year=1900, end_year=2000, year_inc=1,
                                exclude_partial_missing=False):
    """
    Return the mean and standard-deviation arrays for the values of the words specified
    per year in i_year_words for the specified years.
    """
    means = []
    stderrs = []
    r_word_time_series = {}
    if exclude_partial_missing:
        for word, time_series in word_time_series.items():
            if not np.isnan(np.sum(list(time_series.values()))):
                r_word_time_series[word] = time_series
    else:
        r_word_time_series = word_time_series
    for year in range(start_year, end_year + 1, year_inc):
        word_array = np.array([r_word_time_series[word][year] for word in i_year_words[year]
                               if word in r_word_time_series
                               and not np.isnan(r_word_time_series[word][year])
                               and not np.isinf(r_word_time_series[word][year])])
        if len(word_array) == 0:
            continue
        if one_minus:
            word_array = 1 - word_array
        means.append(word_array.mean())
        stderrs.append(word_array.std())
    return np.array(means), np.array(stderrs)
def get_series_mean_stderr_peryear(word_time_series, i_year_words, one_minus=False,
                                   start_year=1900, end_year=2000, year_inc=1,
                                   exclude_partial_missing=False):
    """
    Return the mean and stderr arrays for the values of the words specified per year in
    i_year_words for the specified years.
    """
    means = []
    stderrs = []
    r_word_time_series = {}
    if exclude_partial_missing:
        for word, time_series in word_time_series.items():
            time_series = {year: val for year, val in time_series.items()
                           if start_year <= year <= end_year}
            if not np.isnan(np.sum(list(time_series.values()))):
                r_word_time_series[word] = time_series
    else:
        r_word_time_series = word_time_series
    for year in range(start_year, end_year + 1, year_inc):
        word_array = np.array([r_word_time_series[word][year] for word in i_year_words[year]
                               if word in r_word_time_series
                               and not np.isnan(r_word_time_series[word][year])])
        if one_minus:
            word_array = 1 - word_array
        means.append(word_array.mean())
        stderrs.append(word_array.std() / len(word_array))
    return np.array(means), np.array(stderrs)
def get_yearly_set_dev(series, i_year_words, one_minus=False, start_year=1900,
                       end_year=2000, method='diff'):
    """
    Gets the mean relative deviation of the words in i_year_words vs. the full series.
    """
    base_mat = _make_series_mat(series, series.keys(), one_minus=one_minus,
                                start_year=start_year, end_year=end_year)
    means = []
    stderrs = []
    r_word_time_series = series
    for year in range(start_year, end_year + 1):
        word_array = np.array([r_word_time_series[word][year] for word in i_year_words[year]
                               if word in r_word_time_series
                               and not np.isnan(r_word_time_series[word][year])])
        if one_minus:
            word_array = 1 - word_array
        if method == 'diff':
            word_array = word_array - base_mat.mean(0)[year-start_year]
        elif method == 'ratio':
            word_array = word_array / base_mat.mean(0)[year-start_year]
        else:
            raise RuntimeError("Unknown deviation method. Use diff or ratio.")
        means.append(word_array.mean())
        stderrs.append(word_array.std() / len(word_array))
    return np.array(means), np.array(stderrs)
def log_likelihood(self, data):
    nks = np.bincount(self.labels_, minlength=self.n_clusters)  # number of points in each cluster
    n, d = data.shape
    log_likelihood = 0
    covar_matrices = self.covariances(self.labels_, cluster_centers=self.cluster_centers_, data=data)
    covar_matrix_det_v = np.linalg.det(covar_matrices)
    self._inv_covar_matrices = self._matrix_inverses(covar_matrices)
    for k, nk in enumerate(nks):
        if self.verbose == 1:
            print('log_likelihood: covar_matrix_det = {}'.format(covar_matrix_det_v[k]))
        term_1 = nk * (np.log(float(nk)/n) - 0.5 * d * np.log(2*np.pi)
                       - 0.5 * np.log(abs(covar_matrix_det_v[k])))
        cdist_result = cdist(data[self.labels_ == k], np.array([self.cluster_centers_[k]]),
                             metric='mahalanobis', VI=self._inv_covar_matrices[k])
        cdist_no_nan = cdist_result[~np.isnan(cdist_result)]  # to deal with nans returned by cdist
        term_2 = -0.5 * (np.sum(cdist_no_nan))
        k_sum = term_1 + term_2
        log_likelihood += k_sum
    if np.isnan(log_likelihood) or log_likelihood == float('inf'):
        raise Exception('ll is nan or inf')
    return log_likelihood