# analyze.py source file - Python code snippet

def analyze_r01(state, human, time):
    """
    Iteration 1: fit a linear model of growth against carbon and validate it.

    Reads ``data/<state>_2a.csv`` (when ``human`` is truthy) or
    ``data/<state>_2b.csv`` (otherwise), which must contain the columns
    ``carb`` and ``py``. Each row is paired with the row that follows it;
    only pairs whose ``py // 10000`` values match are kept (presumably the
    high digits of ``py`` identify a plot/series and the low digits a year
    -- confirm against the data source).

    The kept pairs are shuffled with ``random.shuffle``, the first half is
    used to fit ``growth = m * carb + n`` and the second half to validate.

    Args:
        state: String inserted into the CSV file name.
        human: Truthy selects the ``_2a`` file, falsy the ``_2b`` file.
        time: If truthy, growth is the change in ``carb`` divided by the
            change in ``py`` between consecutive rows; otherwise growth is
            simply the following row's ``carb``.

    Returns:
        The root of the mean squared prediction error on the held-out
        half, as a float.

    Raises:
        ValueError: If fewer than two pairs end up in the training half,
            so no line can be fitted.
    """
    # Read data
    suffix = '_2a.csv' if human else '_2b.csv'
    frame = pd.read_csv('data/' + state + suffix)

    # Pair every row with its successor: `current` drops the last row,
    # `following` drops the first; both are re-indexed so they align.
    current = frame.iloc[:-1].reset_index(drop=True)
    following = frame.iloc[1:].reset_index(drop=True)

    # Now we can calculate the difference in carbon
    if time:
        growth = ((following['carb'] - current['carb'])
                  / (following['py'] - current['py']))
    else:
        growth = following['carb']

    # Discard pairs that straddle two different py // 10000 groups.
    same_group = (following['py'] // 10000) == (current['py'] // 10000)
    pairs = pd.DataFrame({'carb': current['carb'], 'growth': growth})
    # `.values` instead of `as_matrix()`, which was removed in pandas 1.0.
    points = pairs.loc[same_group, ['carb', 'growth']].values.tolist()

    # Split data into training and testing. Floor division keeps the
    # split index an int on Python 3 as well as Python 2.
    random.shuffle(points)
    half = len(points) // 2
    if half < 2:
        raise ValueError(
            'need at least 2 training points to fit a line, got %d' % half)
    training = np.array(points[:half])
    test = np.array(points[half:])

    # Create the linear regression function. x and y are passed
    # explicitly: the one-argument form splits a 2x2 array along the
    # wrong axis and is not accepted by newer scipy releases.
    fit = stats.linregress(training[:, 0], training[:, 1])

    # Perform validation
    residuals = test[:, 1] - (fit.slope * test[:, 0] + fit.intercept)
    return math.sqrt(np.mean(residuals ** 2))