def analyze_r02(state, human, time):
    """
    Revision 2

    Cross-validate a straight-line model that predicts 'growth' from 'carb'.

    Each row of the CSV is paired with the row that follows it. The pair is
    kept only when both rows have the same value of ``py // 10000``
    (presumably the upper digits of 'py' identify a plot and the lower
    digits a year -- TODO confirm against the data files).

    Parameters:
        state: string inserted into the CSV path 'data/<state>_2a.csv' or
            'data/<state>_2b.csv'.
        human: if truthy the '_2a' file is read, otherwise the '_2b' file.
        time: if truthy, 'growth' is the change in 'carb' divided by the
            change in 'py' between a row and its successor; otherwise
            'growth' is simply the successor's 'carb' value.

    Returns:
        The mean, over 10 cross-validation folds, of the root-mean-square
        prediction error on the held-out fold.

    Raises:
        ValueError: if fewer than 10 usable row pairs remain, since at
            least one fold would then be empty.
    """
    N_GROUPS = 10

    #Read data
    suffix = '_2a.csv' if human else '_2b.csv'
    raw = pd.read_csv('data/' + state + suffix)

    #Pair every row with its successor; both frames get a fresh 0..n-2 index
    #so that the arithmetic below aligns row i with row i+1
    current = raw.iloc[:-1].reset_index(drop=True)
    following = raw.iloc[1:].reset_index(drop=True)

    #Now we can calculate difference in carbon
    if time:
        growth = ((following['carb'] - current['carb'])
                  / (following['py'] - current['py']))
    else:
        growth = following['carb']

    #Discard pairs that straddle two different py // 10000 blocks
    same_block = ((following['py'] // 10000)
                  == (current['py'] // 10000)).values
    data_points = np.column_stack(
        (current['carb'].values[same_block], growth.values[same_block])
    ).tolist()

    if len(data_points) < N_GROUPS:
        raise ValueError(
            'need at least %d data points for %d-fold cross-validation, '
            'got %d' % (N_GROUPS, N_GROUPS, len(data_points)))

    #Split data into 10 groups
    random.shuffle(data_points)
    total = len(data_points)
    #Integer division keeps the cutoffs usable as slice indices
    groups = [
        data_points[i * total // N_GROUPS:(i + 1) * total // N_GROUPS]
        for i in range(N_GROUPS)
    ]

    #Create the model while performing cross-validation
    sum_rmse = 0.0
    for i, test in enumerate(groups):
        training = np.array(
            [row for j, group in enumerate(groups) if j != i for row in group]
        )
        #Fit once and pass x and y explicitly
        fit = stats.linregress(training[:, 0], training[:, 1])

        held_out = np.array(test)
        predicted = fit.slope * held_out[:, 0] + fit.intercept
        residuals = held_out[:, 1] - predicted
        #Root-mean-square error of this fold
        sum_rmse += math.sqrt(np.mean(residuals ** 2))
    return sum_rmse / N_GROUPS
评论列表
文章目录