def plot_f_distrib_for_many_coefficients(self, features):
    """Plot the F-test for dropping a subset of features from the model.

    Refits least squares on ``self.X`` with the columns named in
    ``features`` removed, forms the statistic

        F = ((RSS_reduced - self.rss) / dfn) / self.pop_var

    and draws it as a red marker on the F(dfn, dfd) density, together with
    vertical lines at the 5% and 95% quantiles of that distribution.

    NOTE(review): ``self.rss``, ``self.pop_var`` and ``self.df`` are read
    as-is. Presumably they hold the full model's residual sum of squares,
    its residual-variance estimate and its residual degrees of freedom --
    confirm against the code that sets them.

    Args:
        features: Names of the features to drop. Each must occur in
            ``self.features``; repeated names are counted once.

    Raises:
        ValueError: If ``features`` is empty, or contains a name that is
            not present in ``self.features``.
    """
    from scipy.stats import f

    # Column positions to remove. De-duplicate so that the numerator
    # degrees of freedom equal the number of columns actually dropped.
    drop_idx = sorted({self.features.index(name) for name in features})
    dfn = len(drop_idx)
    if dfn == 0:
        # With nothing dropped the statistic is 0/0 and F(0, dfd) is
        # undefined; fail loudly instead of plotting NaNs.
        raise ValueError('features must name at least one feature to drop')
    dfd = self.df

    # Prediction from the reduced model (normal equations).
    X = np.delete(self.X, drop_idx, 1)
    XT = X.T
    std_error_matrix = inv(XT.dot(X))
    beta = std_error_matrix.dot(XT).dot(self.y)
    y_hat = X.dot(beta)
    rss_reduced_model = np.sum((self.y - y_hat) ** 2)

    # The increase in RSS should be distributed as chi squared with
    # degrees of freedom equal to the number of dropped features.
    rss_diff = rss_reduced_model - self.rss
    chi_1 = rss_diff / dfn
    chi_2 = self.pop_var
    f_score = chi_1 / chi_2

    # 5% and 95% percentiles of the reference distribution.
    f_05, f_95 = f.ppf([0.05, 0.95], dfn, dfd)

    # Sample the density far enough to the right that the upper quantile
    # line and the observed statistic both sit on the drawn curve; keep
    # the historical upper bound of 5.0 as the minimum extent.
    finite_marks = [v for v in (f_95, f_score) if np.isfinite(v)]
    x_max = max([5.0] + [1.1 * v for v in finite_marks])
    x = np.linspace(0.001, x_max, 500)

    plt.axvline(x=f_05)
    plt.axvline(x=f_95)
    plt.scatter(f_score, f.pdf(f_score, dfn, dfd), marker='o', color='red')
    plt.plot(x, f.pdf(x, dfn, dfd), color='gray', lw=5, alpha=0.6)
    plt.title('f-distribution for dropping features: {0}'.format(features))
    plt.show()
# [web-page residue] Comment list
# [web-page residue] Table of contents