# Module-level setup assumed by this excerpt:
import logging

import numpy as np
from scipy.special import logit
from sklearn.linear_model import LassoCV

logger = logging.getLogger(__name__)


def run_lasso(X, y, max_iter=3000, cv=5, n_threads=1):
    """Run sklearn's LassoCV on the logit-transformed mutation rate.

    Args:
        X (np.array): scaled feature matrix.
        y (pd.DataFrame): four-column response table (nMut, length, N, ...).
        max_iter (int): maximum number of iterations.
        cv (int): number of CV folds.
        n_threads (int): number of threads for parallel computation.

    Returns:
        float: the alpha value selected by cross-validation.
    """
    logger.info('Implementing LassoCV with {} iter. and {}-fold CV'.format(max_iter, cv))
    # Generate the logit-transformed response: mutation count (with a 0.5
    # pseudocount) divided by the number of possible mutated positions.
    y_logit = logit((y.nMut + 0.5) / (y.length * y.N))
    # Sub-sample 300,000 rows of X and y to keep the fit tractable.
    use_ix = np.random.choice(y_logit.shape[0], 300000, replace=False)
    Xsub = X[use_ix, :]
    ysub = y_logit[use_ix]
    reg = LassoCV(max_iter=max_iter, cv=cv, copy_X=False, n_jobs=n_threads)
    lassocv = reg.fit(Xsub, ysub)
    logger.info('LassoCV alpha = {}'.format(lassocv.alpha_))
    return lassocv.alpha_
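
A minimal usage sketch. The shapes, column names, and values below are illustrative, not from the original project; the only requirement is that y has nMut, length, and N columns and at least 300,000 rows:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n_rows = 500000
X = rng.standard_normal((n_rows, 10))
y = pd.DataFrame({'nMut': rng.poisson(2, n_rows),
                  'length': np.full(n_rows, 1000),
                  'N': np.full(n_rows, 100)})
alpha = run_lasso(X, y, max_iter=1000, cv=5, n_threads=4)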
Python logit() usage examples (source code):
def find_coeffs_bin(self, budget):
    """Recover the weights of a binary logistic model from `budget` queries."""
    k = len(self.classes)  # number of classes
    assert k == 2
    n = self.num_features()  # feature-vector dimension
    X_train = self.gen_query_set(n, budget)
    # logit() inverts the sigmoid, turning queried probabilities into the
    # model's linear decision values.
    y = logit(self.query_probas(X_train)[:, 1])
    # Append a column of ones so the intercept is solved for jointly.
    X = np.hstack((X_train, np.ones((budget, 1))))
    if budget == n + 1:
        # Square system: try an exact solve, fall back to least squares.
        try:
            w_opt = np.linalg.solve(X, y).T
        except np.linalg.LinAlgError:
            w_opt = np.linalg.lstsq(X, y)[0].T
    else:
        w_opt = np.linalg.lstsq(X, y)[0].T
    int_opt = w_opt[-1]
    w_opt = np.array([w_opt[:-1]])
    self.X_train = X_train
    return w_opt, int_opt
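
A self-contained sketch of the same idea outside the class: for a logistic model p(x) = sigmoid(w.x + b), logit(p(x)) = w.x + b is linear in x, so n+1 probability queries recover w and b exactly (all names and values here are hypothetical):

import numpy as np
from scipy.special import expit, logit

rng = np.random.default_rng(0)
n = 5
w_true, b_true = rng.standard_normal(n), 0.3
X_q = rng.standard_normal((n + 1, n))          # n+1 query points
p = expit(X_q @ w_true + b_true)               # queried probabilities
A = np.hstack([X_q, np.ones((n + 1, 1))])
coef = np.linalg.solve(A, logit(p))            # exact recovery
assert np.allclose(coef[:-1], w_true) and np.isclose(coef[-1], b_true)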
def predict(st, norm, bounds):
    # Log-transform the reward column, then min-max scale it into (0, 1)
    # and map it onto the real line with logit().
    rew = np.log(1 + st[:, -1:])
    a_x = bounds[0]  # lower bound of the log-reward
    b_x = bounds[2]  # upper bound of the log-reward
    eps = 1e-5
    # Clip away from the bounds so logit() stays finite.
    rew = np.clip(rew, a_x + eps, b_x - eps)
    rew = logit((rew - a_x) / (b_x - a_x))
    st[:, -1:] = rew
    state = np.zeros((1, 61))
    state[0, :] = np.hstack((st[0, 0], st[:, [1, 2, 3, -1]].ravel()))
    X = (state - norm[0]) / norm[1]
    return np.round(policy_network(X)[0, :], 4)
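
A small sketch of the bounded-logit transform used above, in isolation (bounds and values are illustrative): squash a value from [a, b] into (0, 1), then logit() maps it to an unbounded feature, and expit() inverts the whole thing:

import numpy as np
from scipy.special import expit, logit

a, b, eps = 0.0, 10.0, 1e-5
x = np.clip(np.array([0.0, 2.5, 10.0]), a + eps, b - eps)
z = logit((x - a) / (b - a))       # unbounded representation
x_back = a + (b - a) * expit(z)    # round-trip
assert np.allclose(x, x_back)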
def test_logistic_lmm():
    df = pd.read_csv(os.path.join(get_resource_path(), 'sample_data.csv'))
    model = Lmer('DV_l ~ IV1 + (IV1|Group)', data=df, family='binomial')
    model.fit(summarize=False)
    assert model.coefs.shape == (2, 13)
    estimates = np.array([-0.16098421, 0.00296261])
    assert np.allclose(model.coefs['Estimate'], estimates, atol=.001)
    assert isinstance(model.fixef, pd.core.frame.DataFrame)
    assert model.fixef.shape == (47, 2)
    assert isinstance(model.ranef, pd.core.frame.DataFrame)
    assert model.ranef.shape == (47, 2)
    assert np.allclose(model.coefs.loc[:, 'Estimate'], model.fixef.mean(), atol=.01)
    # Test predictions on the response (probability) and link (logit) scales.
    assert np.allclose(model.predict(model.data, use_rfx=True), model.data.fits)
    assert np.allclose(model.predict(model.data, use_rfx=True, pred_type='link'),
                       logit(model.data.fits))
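
The last assertion relies on the logit link being the inverse of the binomial response function; the same identity, library-independent:

import numpy as np
from scipy.special import expit, logit

eta = np.array([-1.2, 0.0, 0.7])   # linear predictor (link scale)
p = expit(eta)                     # fitted probabilities (response scale)
assert np.allclose(logit(p), eta)  # logit() recovers the link scale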
def compute_sgd(data):
    logging.info('Computing SGD')
    n_splits = 10
    folder = StratifiedKFold(n_splits=n_splits, shuffle=True)
    for ix_first, ix_second in tqdm_notebook(
            folder.split(np.zeros(data['y_train'].shape[0]), data['y_train']),
            total=n_splits):
        # {'en__l1_ratio': 0.0001, 'en__alpha': 1e-05}
        model = SGDClassifier(
            loss='log',          # logistic regression; renamed 'log_loss' in sklearn >= 1.1
            penalty='elasticnet',
            fit_intercept=True,
            n_iter=100,          # replaced by max_iter in modern scikit-learn
            shuffle=True,
            n_jobs=-1,
            l1_ratio=0.0001,
            alpha=1e-05,
            class_weight=None)
        model = model.fit(data['X_train'][ix_first, :], data['y_train'][ix_first])
        # Store out-of-fold and test predictions on the logit scale so that
        # averaging happens in log-odds space rather than probability space.
        data['y_train_pred'][ix_second] = logit(
            model.predict_proba(data['X_train'][ix_second, :])[:, 1])
        data['y_test_pred'].append(logit(model.predict_proba(data['X_test'])[:, 1]))
    data['y_test_pred'] = np.array(data['y_test_pred']).T.mean(axis=1)
    return data
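
Averaging fold predictions on the logit scale, as done above, is not the same as averaging probabilities; the logit mean corresponds to a geometric mean of the odds and pulls harder toward confident predictions. A toy comparison:

import numpy as np
from scipy.special import expit, logit

p_folds = np.array([0.9, 0.99])             # two folds' probabilities
mean_prob = p_folds.mean()                  # 0.945
mean_logit = expit(logit(p_folds).mean())   # ~0.968: geometric mean of the odds
print(mean_prob, mean_logit)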
def preprocess_feature(self, feature, parameters):
    is_not_empty = 1 - np.isclose(feature, normalization.MISSING_VALUE)
    if parameters.feature_type == identify_types.BINARY:
        # Binary features are always 1 unless they are 0
        return ((feature != 0) * is_not_empty).astype(np.float32)
    if parameters.boxcox_lambda is not None:
        feature = stats.boxcox(
            np.maximum(
                feature + parameters.boxcox_shift,
                normalization.BOX_COX_MARGIN
            ), parameters.boxcox_lambda
        )
    # No *= to ensure consistent out-of-place operation.
    if parameters.feature_type == identify_types.PROBABILITY:
        # Clip away from {0, 1} so logit() stays finite.
        feature = np.clip(feature, 0.01, 0.99)
        feature = special.logit(feature)
    elif parameters.feature_type == identify_types.QUANTILE:
        # Map each value to the fraction of quantile boundaries it exceeds.
        quantiles = parameters.quantiles
        values = np.zeros(feature.shape)
        for quantile in quantiles:
            values += feature >= quantile
        feature = values / float(len(quantiles))
    elif parameters.feature_type == identify_types.ENUM:
        # One-hot encode enum features.
        possible_values = parameters.possible_values
        mapping = {}
        for i, possible_value in enumerate(possible_values):
            mapping[possible_value] = i
        output_feature = np.zeros((len(feature), len(possible_values)))
        for i, val in enumerate(feature):
            output_feature[i][mapping[val]] = 1.0
        return output_feature
    else:
        # Continuous features: standardize, then zero out missing entries.
        feature = feature - parameters.mean
        feature /= parameters.stddev
        feature *= is_not_empty
    return feature
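
For the PROBABILITY branch specifically, the clip-then-logit step looks like this in isolation (toy values):

import numpy as np
from scipy.special import logit

p = np.array([0.0, 0.5, 1.0])        # raw probability feature
z = logit(np.clip(p, 0.01, 0.99))    # finite even at the endpoints
# z is approximately [-4.595, 0.0, 4.595]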
def run_rndlasso(X, y, alpha,
                 n_resampling=500, sample_fraction=0.1, n_threads=1):
    """Run sklearn's Randomized Lasso (stability selection).

    Note: RandomizedLasso was removed from scikit-learn in 0.21, so this
    requires an older release.

    Args:
        X (np.array): scaled feature matrix.
        y (pd.DataFrame): four-column response table.
        alpha (float): penalty parameter selected by LassoCV.
        n_resampling (int): number of resampling rounds.
        sample_fraction (float): fraction of data used in each round.
        n_threads (int): number of threads for parallel computation.

    Returns:
        np.array: feature importance (selection) scores.
    """
    logger.info('Implementing Randomized Lasso with alpha={}, n_resampling={} and sample_fraction={}'.
                format(alpha, n_resampling, sample_fraction))
    # Generate the logit-transformed response, as in run_lasso().
    y_logit = logit((y.nMut + 0.5) / (y.length * y.N))
    reg = RandomizedLasso(alpha=alpha,
                          n_resampling=n_resampling,
                          sample_fraction=sample_fraction,
                          selection_threshold=1e-3,
                          max_iter=3000,
                          normalize=False,
                          n_jobs=n_threads)
    rndlasso = reg.fit(X, y_logit)
    fi_scores = rndlasso.scores_
    return fi_scores
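
The two functions are meant to be chained: the CV-selected alpha feeds the stability selection. A sketch, reusing the hypothetical X and y from the run_lasso example above:

alpha = run_lasso(X, y, max_iter=3000, cv=5, n_threads=4)
fi_scores = run_rndlasso(X, y, alpha, n_resampling=500,
                         sample_fraction=0.1, n_threads=4)
top_features = np.argsort(fi_scores)[::-1][:20]   # 20 most stable features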
def itransform(self, y_transformed):
    # Map from the transformed (0, 1) space back to the linear scale.
    yscale = logit(y_transformed)
    return yscale / self.scale
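
This inverse implies a forward transform of the form expit(scale * y); a round-trip sketch under that assumption (scale and values are illustrative):

import numpy as np
from scipy.special import expit, logit

scale = 2.0
y = np.array([-1.0, 0.0, 1.5])
y_transformed = expit(scale * y)        # assumed forward transform
y_back = logit(y_transformed) / scale   # the itransform above
assert np.allclose(y, y_back)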
def munge_scoreses(scoreses, df):
    npredictors = len(scoreses)
    score_shape = (len(df), npredictors)
    scores = np.empty(score_shape, dtype=np.float32)
    # Yay, nested loops :/
    i = 0
    for (uid, pid) in df[['uid', 'pid']].itertuples(index=False):
        for predictor_ix, pdict in enumerate(scoreses):
            prob = pdict[uid][pid]
            # Store each predictor's probability on the logit scale.
            scores[i, predictor_ix] = logit(prob)
        i += 1
    return scores
def vectorize_fold(fold, tags, meta_df, use_metafeats=True):
    with time_me('Loaded pdicts'):
        scoreses = [common.pdict_for_tag(tag, fold) for tag in tags]
    df = meta_df[meta_df['fold'] == fold]
    assert len(df)
    y = df['label']
    n_predictors = len(scoreses)
    with time_me('Munged scores for {} predictors'.format(n_predictors), mode='print'):
        # TODO: could use the logit loading fn added to user_wrapper module
        scores = munge_scoreses(scoreses, df)
    if not use_metafeats:
        X = scores
    else:
        meta_cols = metavectorize.metafeature_columns
        meta = df[meta_cols].values
        # Special f_0 dummy metafeature for learning a vanilla weight term per predictor.
        metafeats = np.hstack([np.ones((len(df), 1)), meta])
        # (np.einsum could express this more directly, and sklearn.preprocessing
        # has a 'PolynomialFeatures' utility that might have been useful here.
        # But this works.)
        n_metafeats = metafeats.shape[1]
        logging.info('{} predictors x {} metafeatures -> {} coefs'.format(
            n_predictors, n_metafeats, n_predictors * n_metafeats))
        # X is 'metafeat major': the first n_predictors values of each row are
        # the raw scores for each predictor, followed by each predictor's
        # score multiplied by the first metafeature, and so on.
        X = np.tile(scores, n_metafeats) * np.repeat(metafeats, n_predictors, axis=1)
    return X, y
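
The tile/repeat trick builds every score-metafeature product without einsum; a toy check of the layout:

import numpy as np

scores = np.array([[1.0, 2.0]])        # one row, 2 predictors
metafeats = np.array([[1.0, 10.0]])    # dummy f_0 plus one metafeature
X = np.tile(scores, 2) * np.repeat(metafeats, 2, axis=1)
# 'metafeat major': [s1*f0, s2*f0, s1*f1, s2*f1]
assert (X == np.array([[1.0, 2.0, 10.0, 20.0]])).all()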
def fit(self, X, s):
    # Augment X with a ones column, then solve a linear least-squares
    # problem against the logit-transformed targets.
    _x = np.ones((X.shape[0], X.shape[1] + 1))
    _x[:, :-1] = X
    self.w, _, _, _ = np.linalg.lstsq(_x, logit(s))
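
Standalone, the same least-squares-on-logits fit recovers the weights of a noiseless sigmoid model exactly (synthetic data, hypothetical names):

import numpy as np
from scipy.special import expit, logit

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 3))
w_true = np.array([0.5, -1.0, 2.0, 0.25])    # last entry is the bias
Xb = np.hstack([X, np.ones((200, 1))])
s = expit(Xb @ w_true)                       # observed scores in (0, 1)
w, *_ = np.linalg.lstsq(Xb, logit(s), rcond=None)
assert np.allclose(w, w_true)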
From distribution_util_test.py, project DeepLearning_VirtualReality_BigData_Project (author: rashmitripathi):
def testGetLogitsAndProbsLogits(self):
    p = np.array([0.01, 0.2, 0.5, 0.7, .99], dtype=np.float32)
    logits = special.logit(p)
    with self.test_session():
        new_logits, new_p = distribution_util.get_logits_and_probs(
            logits=logits, validate_args=True)
        self.assertAllClose(p, new_p.eval())
        self.assertAllClose(logits, new_logits.eval())
def testGetLogitsAndProbsProbability(self):
    p = np.array([0.01, 0.2, 0.5, 0.7, .99], dtype=np.float32)
    with self.test_session():
        new_logits, new_p = distribution_util.get_logits_and_probs(
            probs=p, validate_args=True)
        self.assertAllClose(special.logit(p), new_logits.eval())
        self.assertAllClose(p, new_p.eval())
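
Both tests hinge on logit() and the sigmoid being exact inverses; the scipy pair alone:

import numpy as np
from scipy.special import expit, logit

p = np.array([0.01, 0.2, 0.5, 0.7, 0.99], dtype=np.float32)
assert np.allclose(expit(logit(p)), p)   # expit inverts logit
assert logit(0.5) == 0.0                 # log-odds of 0.5 is zero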