def calculatepRCA(data, y='', c='', p='', x='', n_neighbors=200, n_chunks=10):
    r'''
    Returns the pRCA from data. pRCA is the estimated probability that
    (RCA_{y+1} > 1) given the volume of exports (x_{cpy}) and the
    'baseline term' (\sum_c x_{cpy} \sum_p x_{cpy} / \sum_c \sum_p x_{cpy}).
    It is computed with a k-nearest-neighbors regressor fitted on the
    'log(x)' and 'T' columns produced by calculateRCA_by_year (called with
    log_terms=True), using the 0/1 indicator of log(RCA) > 0 in the
    following year as the target. Predictions are made in-sample, i.e. on
    the same rows the regressor was fitted on.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw data with year, country, product and volume columns.
    y,c,p,x : str (optional)
        Labels of the columns in data used for year, country, product and
        volume. An empty string falls back to 'year', 'ccode', 'pcode' and
        'x' respectively (the names this function always used).
    n_neighbors : int (optional)
        Number of neighbors used by the regressor. Default 200.
    n_chunks : int (optional)
        Number of pieces the feature matrix is split into for prediction,
        to limit peak memory. Default 10.

    Returns
    -------
    df : pandas.DataFrame
        Output of calculateRCA_by_year restricted to the rows that also
        have an observation for the same country/product in the next year,
        with two extra columns:
        RCA_y+1 : 1 if log(RCA) > 0 in the following year, else 0
        pRCA    : k-NN estimate of the probability that RCA_y+1 is 1
        When custom labels were passed, the year/country/product/volume
        columns are renamed back to those labels.
    '''
    # The rest of the function works with fixed column names, so map the
    # caller's labels onto them first (the original ignored y, c, p, x).
    canonical = {'year': y, 'ccode': c, 'pcode': p, 'x': x}
    to_canonical = {label: name for name, label in canonical.items()
                    if label and label != name}
    if to_canonical:
        data = data.rename(columns=to_canonical)

    df = calculateRCA_by_year(data, y='year', c='ccode', p='pcode', x='x',
                              log_terms=True)

    # Build the (RCA > 1) indicator and shift it back one year, so that
    # merging on (year, ccode, pcode) attaches next year's outcome to each row.
    keys = ['year', 'ccode', 'pcode']
    next_year = df[keys + ['log(RCA)']].copy()
    next_year['year'] = next_year['year'] - 1
    next_year['RCA_y+1'] = (next_year['log(RCA)'] > 0).astype(int)
    # Inner merge: rows without a next-year observation are dropped.
    df = df.merge(next_year[keys + ['RCA_y+1']], on=keys)

    # Prepare dataset for knn and fit. `.values` replaces the removed
    # DataFrame.as_matrix().
    matrix = df[['log(x)', 'T', 'RCA_y+1']].values
    features, target = matrix[:, :2], matrix[:, 2]
    knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors,
                                        weights='uniform').fit(features, target)

    # To avoid a memory error, predict chunk by chunk and join the pieces once.
    df['pRCA'] = np.concatenate(
        [knn.predict(chunk) for chunk in np.array_split(features, n_chunks)])

    if to_canonical:
        # Hand the identifying columns back under the caller's own labels.
        df = df.rename(columns={name: label
                                for label, name in to_canonical.items()})
    return df
# (Web-page navigation residue, not part of the code: "comment list" / "table of contents".)