def _compute_snp_distances(self, df, build):
if build == 36:
hapmap = self._resources.get_hapmap_h36()
else:
hapmap = self._resources.get_hapmap_h37()
for chrom in df['chrom'].unique():
if chrom not in hapmap.keys():
continue
# create a new dataframe from the positions for the current chromosome
temp = pd.DataFrame(df.loc[(df['chrom'] == chrom)]['pos'].values, columns=['pos'])
# merge HapMap for this chrom
temp = temp.append(hapmap[chrom], ignore_index=True)
# sort based on pos
temp = temp.sort_values('pos')
# fill cM rates forward and backward
temp['rate'] = temp['rate'].fillna(method='ffill')
temp['rate'] = temp['rate'].fillna(method='bfill')
# get difference between positions
pos_diffs = np.ediff1d(temp['pos'])
# compute cMs between each pos based on probabilistic recombination rate
# https://www.biostars.org/p/123539/
cMs_match_segment = (temp['rate'] * np.r_[pos_diffs, 0] / 1e6).values
# add back into temp
temp['cMs'] = np.r_[0, cMs_match_segment][:-1]
temp = temp.reset_index()
del temp['index']
# use null `map` values to find locations of SNPs
snp_indices = temp.loc[temp['map'].isnull()].index
# use SNP indices to determine boundaries over which to sum cMs
start_snp_ix = snp_indices + 1
end_snp_ix = np.r_[snp_indices, snp_indices[-1]][1:] + 1
snp_boundaries = np.c_[start_snp_ix, end_snp_ix]
# sum cMs between SNPs to get total cM distance between SNPs
# http://stackoverflow.com/a/7471967
c = np.r_[0, temp['cMs'].cumsum()][snp_boundaries]
cM_from_prev_snp = c[:, 1] - c[:, 0]
# debug
# temp.loc[snp_indices, 'cM_from_prev_snp'] = np.r_[0, cM_from_prev_snp][:-1]
# temp.to_csv('debug.csv')
# add back into df
df.loc[(df['chrom'] == chrom), 'cM_from_prev_snp'] = np.r_[0, cM_from_prev_snp][:-1]
return hapmap, df
评论列表
文章目录