def test_alpha(self, returns, benchmark, expected):
observed = self.empyrical.alpha(returns, benchmark)
assert_almost_equal(
observed,
expected,
DECIMAL_PLACES)
if len(returns) == len(benchmark):
# Compare to scipy linregress
returns_arr = returns.values
benchmark_arr = benchmark.values
mask = ~np.isnan(returns_arr) & ~np.isnan(benchmark_arr)
slope, intercept, _, _, _ = stats.linregress(benchmark_arr[mask],
returns_arr[mask])
assert_almost_equal(
observed,
intercept * 252,
DECIMAL_PLACES
)
# Alpha/beta translation tests.
def test_beta(self, returns, benchmark, expected):
observed = self.empyrical.beta(returns, benchmark)
assert_almost_equal(
observed,
expected,
DECIMAL_PLACES)
if len(returns) == len(benchmark):
# Compare to scipy linregress
returns_arr = returns.values
benchmark_arr = benchmark.values
mask = ~np.isnan(returns_arr) & ~np.isnan(benchmark_arr)
slope, intercept, _, _, _ = stats.linregress(benchmark_arr[mask],
returns_arr[mask])
assert_almost_equal(
observed,
slope
)
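# A standalone sketch of the linregress cross-check used in the two tests
# above (toy data, not from the original suite): beta is the OLS slope of
# returns on benchmark returns, and alpha is the daily regression intercept
# annualized by ~252 trading days; the tests only compare this to empyrical's
# own alpha up to DECIMAL_PLACES, since the annualizations differ slightly.
import numpy as np
from scipy import stats

rng = np.random.RandomState(0)
benchmark = rng.normal(0.0005, 0.01, 500)
returns = 0.8 * benchmark + rng.normal(0.0002, 0.005, 500)
mask = ~np.isnan(returns) & ~np.isnan(benchmark)
slope, intercept, _, _, _ = stats.linregress(benchmark[mask], returns[mask])
print("beta  ~", slope)            # close to the true 0.8
print("alpha ~", intercept * 252)  # annualized regression intercept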
def strategy(data, params):
"""
Stack overlapping intervals.
Assumes that each set has the same horizontal position
"""
vjust = params['vjust']
y = data['y'].copy()
y[np.isnan(y)] = 0
heights = np.append(0, y.cumsum())
if params['fill']:
heights = heights / np.abs(heights[-1])
data['ymin'] = np.min([heights[:-1], heights[1:]], axis=0)
data['ymax'] = np.max([heights[:-1], heights[1:]], axis=0)
    # Less intuitive than ymin + vjust*(ymax - ymin), but this way
    # avoids subtracting numbers of potentially similar magnitude,
    # which would lose floating-point precision
data['y'] = ((1-vjust)*data['ymin'] + vjust*data['ymax'])
return data
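# A toy run of the stacking arithmetic above (illustrative values): NaNs
# count as zero height, the cumulative sum gives the interval edges, and
# vjust interpolates between ymin and ymax.
import numpy as np

y = np.array([1.0, np.nan, 2.0])
y[np.isnan(y)] = 0
heights = np.append(0, y.cumsum())                  # [0. 1. 1. 3.]
ymin = np.min([heights[:-1], heights[1:]], axis=0)  # [0. 1. 1.]
ymax = np.max([heights[:-1], heights[1:]], axis=0)  # [1. 1. 3.]
vjust = 0.5
print((1 - vjust) * ymin + vjust * ymax)            # [0.5 1.  2. ] -> interval midpoints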
def _find_index(bg_df, start_date, end_date, make_col_bool):
    if make_col_bool:
        bg_df['date'] = bg_df['created_at'].apply(lambda x: x.date())  # column holding just the date part
    # Find the first entry for the start date and the last entry for the end date.
    # Since older dates have higher indices, use max() for the start and min() for the end.
start_index = bg_df[bg_df['date'] == start_date.date()].index.max()
end_index = bg_df[bg_df['date'] == end_date.date()].index.min()
    # Raise if either date is absent (an empty selection's max()/min() is NaN)
if np.isnan(start_index): raise Exception("Invalid start date: " + str(start_date.date()))
if np.isnan(end_index): raise Exception("Invalid end date: " + str(end_date.date()))
return bg_df, start_index, end_index
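# Why the np.isnan checks above work as "date not found" tests: calling
# .max() or .min() on an empty index selection returns nan rather than
# raising (a minimal sketch with toy data).
import numpy as np
import pandas as pd

df = pd.DataFrame({'x': [1, 2, 3]})
missing = df[df['x'] == 99].index.max()
print(missing, np.isnan(missing))  # nan True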
#Function to get the bg data
def plot_heatmaps(data, mis, column_label, cont, topk=30, prefix=''):
cmap = sns.cubehelix_palette(as_cmap=True, light=.9)
m, nv = mis.shape
for j in range(m):
inds = np.argsort(- mis[j, :])[:topk]
if len(inds) >= 2:
plt.clf()
order = np.argsort(cont[:,j])
subdata = data[:, inds][order].T
subdata -= np.nanmean(subdata, axis=1, keepdims=True)
subdata /= np.nanstd(subdata, axis=1, keepdims=True)
columns = [column_label[i] for i in inds]
sns.heatmap(subdata, vmin=-3, vmax=3, cmap=cmap, yticklabels=columns, xticklabels=False, mask=np.isnan(subdata))
filename = '{}/heatmaps/group_num={}.png'.format(prefix, j)
if not os.path.exists(os.path.dirname(filename)):
os.makedirs(os.path.dirname(filename))
plt.title("Latent factor {}".format(j))
plt.yticks(rotation=0)
plt.savefig(filename, bbox_inches='tight')
plt.close('all')
#plot_rels(data[:, inds], map(lambda q: column_label[q], inds), colors=cont[:, j],
# outfile=prefix + '/relationships/group_num=' + str(j), latent=labels[:, j], alpha=0.1)
def write_data(self, result_dict):
for key, result in six.iteritems(result_dict):
if ss.isspmatrix(result):
if np.isnan(result.data).any():
raise ValueError("data {} have nan".format(key))
elif np.isnan(result).any():
raise ValueError("data {} have nan".format(key))
with SimpleTimer("Writing generated data {} to hdf5 file"
.format(key),
end_in_new_line=False):
if key in self.h5f:
# self.h5f[key][...] = result
raise NotImplementedError("Overwriting not supported.")
else:
                if isinstance(result, (ss.csc_matrix, ss.csr_matrix)):
# sparse matrix
h5sparse.Group(self.h5f).create_dataset(key,
data=result)
else:
self.h5f.create_dataset(key, data=result)
self.h5f.flush()
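# The NaN guard above in isolation (toy matrices): for a sparse matrix only
# the stored values (.data) are checked, since the implicit zeros cannot
# be NaN.
import numpy as np
import scipy.sparse as ss

dense = np.array([[1.0, np.nan]])
sparse = ss.csr_matrix(np.array([[0.0, 2.0]]))
print(np.isnan(dense).any())        # True  -> write_data would raise
print(np.isnan(sparse.data).any())  # False -> safe to write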
def repeat_until_convergence(labelled_data, labelled_clusters, unlabelled_centroids):
    # Iterate until the centroids that best fit labelled_data stop moving
previous_max_difference = 0
while True:
unlabelled_old_centroids = unlabelled_centroids
unlabelled_centroids = move_centroids(labelled_clusters)
labelled_clusters = form_clusters(labelled_data, unlabelled_centroids)
        differences = list(map(lambda a, b: np.linalg.norm(a - b),
                               unlabelled_old_centroids, unlabelled_centroids))
max_difference = max(differences)
if np.isnan(max_difference-previous_max_difference):
difference_change = np.nan
else:
difference_change = abs((max_difference-previous_max_difference)/np.mean([previous_max_difference,max_difference])) * 100
previous_max_difference = max_difference
# difference change is nan once the list of differences is all zeroes.
if np.isnan(difference_change):
break
return labelled_clusters, unlabelled_centroids
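# A toy illustration of the termination condition above: once the centroids
# stop moving, every difference is 0.0, np.mean([0.0, 0.0]) is 0.0, and
# 0.0/0.0 under numpy yields nan, which breaks the loop.
import numpy as np

prev, curr = np.float64(0.0), np.float64(0.0)
with np.errstate(invalid='ignore'):
    change = abs((curr - prev) / np.mean([prev, curr])) * 100
print(np.isnan(change))  # True -> convergence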
def loadData (self, filename, verbose=True, replace_missing=True):
''' Get the data from a text file in one of 3 formats: matrix, sparse, binary_sparse'''
if verbose: print("========= Reading " + filename)
start = time.time()
if self.use_pickle and os.path.exists (os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle")):
        with open(os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle"), "rb") as pickle_file:  # pickle data must be read in binary mode
vprint (verbose, "Loading pickle file : " + os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle"))
return pickle.load(pickle_file)
if 'format' not in self.info.keys():
self.getFormatData(filename)
if 'feat_num' not in self.info.keys():
self.getNbrFeatures(filename)
data_func = {'dense':data_io.data, 'sparse':data_io.data_sparse, 'sparse_binary':data_io.data_binary_sparse}
data = data_func[self.info['format']](filename, self.info['feat_num'])
    # IMPORTANT: when we replace missing values we double the number of variables
    if self.info['format'] == 'dense' and replace_missing and np.isnan(data).any():
vprint (verbose, "Replace missing values by 0 (slow, sorry)")
data = data_converter.replace_missing(data)
if self.use_pickle:
with open (os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle"), "wb") as pickle_file:
vprint (verbose, "Saving pickle file : " + os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle"))
p = pickle.Pickler(pickle_file)
p.fast = True
p.dump(data)
end = time.time()
if verbose: print( "[+] Success in %5.2f sec" % (end - start))
return data
def sanitize_array(array):
''' Replace NaN and Inf (there should not be any!)'''
    a = np.ravel(array)
    maxi = np.nanmax(a[a != np.inf])   # max, ignoring NaN and +inf
    mini = np.nanmin(a[a != -np.inf])  # min, ignoring NaN and -inf
array[array==float('inf')]=maxi
array[array==float('-inf')]=mini
mid = (maxi + mini)/2
array[np.isnan(array)]=mid
return array
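# Example behaviour of sanitize_array on toy input: +inf/-inf are clipped
# to the finite extremes and NaN is replaced by their midpoint.
import numpy as np

arr = np.array([1.0, np.inf, -np.inf, np.nan, 3.0])
print(sanitize_array(arr))  # [1. 3. 1. 2. 3.]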
def htmt(self):
htmt_ = pd.DataFrame(pd.DataFrame.corr(self.data_),
index=self.manifests, columns=self.manifests)
mean = []
allBlocks = []
for i in range(self.lenlatent):
block_ = self.Variables['measurement'][
self.Variables['latent'] == self.latent[i]]
allBlocks.append(list(block_.values))
        block = htmt_.loc[block_, block_]  # .loc replaces the removed .ix
mean_ = (block - np.diag(np.diag(block))).values
mean_[mean_ == 0] = np.nan
mean.append(np.nanmean(mean_))
comb = [[k, j] for k in range(self.lenlatent)
for j in range(self.lenlatent)]
comb_ = [(np.sqrt(mean[comb[i][1]] * mean[comb[i][0]]))
for i in range(self.lenlatent ** 2)]
comb__ = []
for i in range(self.lenlatent ** 2):
        block = (htmt_.loc[allBlocks[comb[i][1]],
                           allBlocks[comb[i][0]]]).values
# block[block == 1] = np.nan
comb__.append(np.nanmean(block))
htmt__ = np.divide(comb__, comb_)
where_are_NaNs = np.isnan(htmt__)
htmt__[where_are_NaNs] = 0
htmt = pd.DataFrame(np.tril(htmt__.reshape(
(self.lenlatent, self.lenlatent)), k=-1), index=self.latent, columns=self.latent)
return htmt
def get_cubic_root(self):
# We have the equation x^2 D^2 + (1-x)^4 * C / h_min^2
# where x = sqrt(mu).
# We substitute x, which is sqrt(mu), with x = y + 1.
# It gives y^3 + py = q
# where p = (D^2 h_min^2)/(2*C) and q = -p.
    # We use Vieta's substitution to compute the root.
# There is only one real solution y (which is in [0, 1] ).
# http://mathworld.wolfram.com/VietasSubstitution.html
# eps in the numerator is to prevent momentum = 1 in case of zero gradient
if np.isnan(self._dist_to_opt) or np.isnan(self._h_min) or np.isnan(self._grad_var) \
or np.isinf(self._dist_to_opt) or np.isinf(self._h_min) or np.isinf(self._grad_var):
logging.warning("Input to cubic solver has invalid nan/inf value!")
raise Exception("Input to cubic solver has invalid nan/inf value!")
p = (self._dist_to_opt + eps)**2 * (self._h_min + eps)**2 / 2 / (self._grad_var + eps)
w3 = (-math.sqrt(p**2 + 4.0 / 27.0 * p**3) - p) / 2.0
w = math.copysign(1.0, w3) * math.pow(math.fabs(w3), 1.0/3.0)
y = w - p / 3.0 / (w + eps)
x = y + 1
if self._verbose:
logging.debug("p %f, denominator %f", p, self._grad_var + eps)
logging.debug("w3 %f ", w3)
logging.debug("y %f, denominator %f", y, w + eps)
if np.isnan(x) or np.isinf(x):
logging.warning("Output from cubic is invalid nan/inf value!")
raise Exception("Output from cubic is invalid nan/inf value!")
return x
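# A numeric check of the closed form above (illustrative value of p): for
# the depressed cubic y^3 + p*y = -p, the Vieta substitution y = w - p/(3w)
# with w^3 = (-sqrt(p^2 + 4/27*p^3) - p)/2 yields the single real root.
import math

p = 0.37  # p > 0 whenever D, h_min and C are positive
w3 = (-math.sqrt(p**2 + 4.0 / 27.0 * p**3) - p) / 2.0
w = math.copysign(1.0, w3) * math.pow(math.fabs(w3), 1.0 / 3.0)
y = w - p / (3.0 * w)
assert abs(y**3 + p * y + p) < 1e-12  # y solves the cubic
x = y + 1  # x = sqrt(mu), so the recovered momentum is x**2
print(x)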
def treegauss_add_row(
data_row,
tree_grid,
program,
latent_row,
vert_ss,
edge_ss,
feat_ss, ):
# Sample latent state using dynamic programming.
TODO('https://github.com/posterior/treecat/issues/26')
# Update sufficient statistics.
for v in range(latent_row.shape[0]):
z = latent_row[v, :]
vert_ss[v, :, :] += np.outer(z, z)
for e in range(tree_grid.shape[1]):
z1 = latent_row[tree_grid[1, e], :]
z2 = latent_row[tree_grid[2, e], :]
edge_ss[e, :, :] += np.outer(z1, z2)
for v, x in enumerate(data_row):
if np.isnan(x):
continue
z = latent_row[v, :]
        feat_ss[v, 0] += 1  # count of observed values for feature v
feat_ss[v, 1] += x
feat_ss[v, 2:] += x * z # TODO Use central covariance.
def imputeSNPs(X):
snpsMean = np.nanmean(X, axis=0)
isNan = np.isnan(X)
    for i, m in enumerate(snpsMean):
        X[isNan[:, i], i] = m
return X
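# Quick check with a toy genotype matrix: each NaN becomes the mean of its
# own column.
import numpy as np

X = np.array([[0.0, 2.0],
              [np.nan, 1.0],
              [2.0, np.nan]])
print(imputeSNPs(X))  # [[0. 2.] [1. 1.] [2. 1.5]]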
def __call__(self, *args, **kwargs):
assert len(args) <= len(self.inputs), "Too many arguments provided"
feed_dict = {}
# Update the args
for inpt, value in zip(self.inputs, args):
self._feed_input(feed_dict, inpt, value)
# Update the kwargs
kwargs_passed_inpt_names = set()
for inpt in self.inputs[len(args):]:
inpt_name = inpt.name.split(':')[0]
inpt_name = inpt_name.split('/')[-1]
assert inpt_name not in kwargs_passed_inpt_names, \
"this function has two arguments with the same name \"{}\", so kwargs cannot be used.".format(inpt_name)
if inpt_name in kwargs:
kwargs_passed_inpt_names.add(inpt_name)
self._feed_input(feed_dict, inpt, kwargs.pop(inpt_name))
else:
assert inpt in self.givens, "Missing argument " + inpt_name
assert len(kwargs) == 0, "Function got extra arguments " + str(list(kwargs.keys()))
# Update feed dict with givens.
for inpt in self.givens:
feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt])
results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1]
if self.check_nan:
if any(np.isnan(r).any() for r in results):
raise RuntimeError("Nan detected")
return results
def test_gradients(self):
inputs = tf.random_normal(
[self.batch_size, self.sequence_length, self.input_depth])
seq_length = tf.ones(self.batch_size, dtype=tf.int32) * self.sequence_length
labels = np.random.randint(0, self.vocab_size,
[self.batch_size, self.sequence_length])
helper = decode_helper.TrainingHelper(
inputs=inputs, sequence_length=seq_length)
decoder_fn = self.create_decoder(
helper=helper, mode=tf.contrib.learn.ModeKeys.TRAIN)
initial_state = decoder_fn.cell.zero_state(
self.batch_size, dtype=tf.float32)
decoder_output, _ = decoder_fn(initial_state, helper)
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
logits=decoder_output.logits, labels=labels)
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
grads_and_vars = optimizer.compute_gradients(tf.reduce_mean(losses))
#pylint: disable=E1101
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
grads_and_vars_ = sess.run(grads_and_vars)
for grad, _ in grads_and_vars_:
self.assertFalse(np.isnan(grad).any())
return grads_and_vars_
def frame_to_series(self, field, frame, columns=None):
"""
Convert a frame with a DatetimeIndex and sid columns into a series with
a sid index, using the aggregator defined by the given field.
"""
if isinstance(frame, pd.DataFrame):
columns = frame.columns
frame = frame.values
if not len(frame):
return pd.Series(
data=(0 if field == 'volume' else np.nan),
index=columns,
).values
if field in ['price', 'close']:
# shortcircuit for full last row
vals = frame[-1]
if np.all(~np.isnan(vals)):
return vals
return ffill(frame)[-1]
elif field == 'open':
return bfill(frame)[0]
elif field == 'volume':
return np.nansum(frame, axis=0)
elif field == 'high':
return np.nanmax(frame, axis=0)
elif field == 'low':
return np.nanmin(frame, axis=0)
else:
raise ValueError("Unknown field {}".format(field))
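# The per-field aggregations above on a toy frame (rows = bars, columns =
# sids); ffill/bfill are internal helpers, so only the numpy paths are
# shown here.
import numpy as np

frame = np.array([[10.0, np.nan],
                  [np.nan, 12.0],
                  [11.0, np.nan]])
print(np.nansum(frame, axis=0))      # volume: [21. 12.]
print(np.nanmax(frame, axis=0))      # high:   [11. 12.]
print(np.nanmin(frame, axis=0))      # low:    [10. 12.]
print(np.all(~np.isnan(frame[-1])))  # False -> price falls back to ffill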
def update_last_known_values(self):
"""
Store the non-NaN values from our oldest frame in each frequency.
"""
ffillable = self.ffillable_fields
if not len(ffillable):
return
for frequency in self.unique_frequencies:
digest_panel = self.digest_panels.get(frequency, None)
if digest_panel:
oldest_known_values = digest_panel.oldest_frame(raw=True)
else:
oldest_known_values = self.buffer_panel.oldest_frame(raw=True)
oldest_vals = oldest_known_values
oldest_columns = self.fields
for field in ffillable:
f_idx = oldest_columns.get_loc(field)
field_vals = oldest_vals[f_idx]
# isnan would be fast, possible to use?
non_nan_sids = np.where(pd.notnull(field_vals))
key = (frequency.freq_str, field)
key_loc = self.last_known_prior_values.index.get_loc(key)
self.last_known_prior_values.values[
key_loc, non_nan_sids
] = field_vals[non_nan_sids]
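# Answering the "isnan would be fast, possible to use?" comment above: for
# a float ndarray pd.notnull is equivalent to ~np.isnan, so the swap would
# be safe (and skips the pandas dispatch overhead).
import numpy as np
import pandas as pd

field_vals = np.array([1.0, np.nan, 3.0])
assert (np.where(pd.notnull(field_vals))[0]
        == np.where(~np.isnan(field_vals))[0]).all()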
def check_entry(key, value):
if key != 'period_label':
return np.isnan(value) or np.isinf(value)
else:
return False
############################
# Risk Metric Calculations #
############################
def _compute_asset_lifetimes(self):
"""
    Compute and cache a recarray of asset lifetimes.
"""
equities_cols = self.equities.c
buf = np.array(
tuple(
sa.select((
equities_cols.sid,
equities_cols.start_date,
equities_cols.end_date,
)).execute(),
), dtype='<f8', # use doubles so we get NaNs
)
lifetimes = np.recarray(
buf=buf,
shape=(len(buf),),
dtype=[
('sid', '<f8'),
('start', '<f8'),
('end', '<f8')
],
)
start = lifetimes.start
end = lifetimes.end
start[np.isnan(start)] = 0 # convert missing starts to 0
end[np.isnan(end)] = np.iinfo(int).max # convert missing end to INTMAX
# Cast the results back down to int.
return lifetimes.astype([
('sid', '<i8'),
('start', '<i8'),
('end', '<i8'),
])
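# A minimal sketch of the trick above (toy values, illustrative sentinel):
# build the record array as float64 so missing values can be NaN, patch the
# NaNs with integer sentinels, then cast down to int64. 2**53 is used here
# because it is exactly representable as a double; np.iinfo(int).max rounds
# to 2**63 as a double, which does not survive the cast back to int64.
import numpy as np

buf = np.array([(1.0, np.nan, 730.0), (2.0, 365.0, np.nan)], dtype='<f8')
lifetimes = np.recarray(buf=buf, shape=(2,),
                        dtype=[('sid', '<f8'), ('start', '<f8'), ('end', '<f8')])
lifetimes.start[np.isnan(lifetimes.start)] = 0
lifetimes.end[np.isnan(lifetimes.end)] = 2**53
print(lifetimes.astype([('sid', '<i8'), ('start', '<i8'), ('end', '<i8')]))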
def _compute(self, arrays, dates, assets, mask):
data = arrays[0]
bins = self.params['bins']
to_bin = where(mask, data, nan)
result = quantiles(to_bin, bins)
# Write self.missing_value into nan locations, whether they were
# generated by our input mask or not.
result[isnan(result)] = self.missing_value
return result.astype(int64_dtype)