def crossGenotyper(args):
## Get the VCF file (filtered may be) generated by GATK.
## inputs:
# 1) VCF file
# 2) Parent1 and Parent2
# 3) SNP matrix (hdf5 file)
# 4) Bin length, default as 200Kbp
# 5) Chromosome length
log.info("loading genotype data for parents")
if args['father'] is not None:
log.info("input files: %s and %s" % (args['parents'], args['father']))
if not os.path.isfile(args['parents']) and os.path.isfile(args['father']):
die("either of the input files do not exists, please provide VCF/BED file for parent genotype information")
(p1snpCHR, p1snpPOS, p1snpGT, p1snpWEI, p1DPmean) = snpmatch.parseInput(inFile = args['parents'], logDebug = args['logDebug'])
(p2snpCHR, p2snpPOS, p2snpGT, p2snpWEI, p2DPmean) = snpmatch.parseInput(inFile = args['father'], logDebug = args['logDebug'])
commonCHRs_ids = np.union1d(p1snpCHR, p2snpCHR)
commonSNPsCHR = np.zeros(0, dtype=commonCHRs_ids.dtype)
commonSNPsPOS = np.zeros(0, dtype=int)
snpsP1 = np.zeros(0, dtype='int8')
snpsP2 = np.zeros(0, dtype='int8')
for i in commonCHRs_ids:
perchrP1inds = np.where(p1snpCHR == i)[0]
perchrP2inds = np.where(p2snpCHR == i)[0]
perchrPositions = np.union1d(p1snpPOS[perchrP1inds], p2snpPOS[perchrP2inds])
commonSNPsCHR = np.append(commonSNPsCHR, np.repeat(i, len(perchrPositions)))
commonSNPsPOS = np.append(commonSNPsPOS, perchrPositions)
perchrsnpsP1 = np.repeat(-1, len(perchrPositions)).astype('int8')
perchrsnpsP2 = np.repeat(-1, len(perchrPositions)).astype('int8')
perchrsnpsP1_inds = np.where(np.in1d(p1snpPOS[perchrP1inds], perchrPositions))[0]
perchrsnpsP2_inds = np.where(np.in1d(p2snpPOS[perchrP2inds], perchrPositions))[0]
snpsP1 = np.append(snpsP1, snpmatch.parseGT(p1snpGT[perchrsnpsP1_inds]))
snpsP2 = np.append(snpsP2, snpmatch.parseGT(p2snpGT[perchrsnpsP2_inds]))
log.info("done!")
else:
parents = args['parents']
## need to filter the SNPs present in C and M
if not args['hdf5accFile']:
snpmatch.die("needed a HDF5 genotype file and not specified")
log.info("loading HDF5 file")
g_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
## die if either parents are not in the dataset
#import ipdb; ipdb.set_trace()
try:
indP1 = np.where(g_acc.accessions == parents.split("x")[0])[0][0]
indP2 = np.where(g_acc.accessions == parents.split("x")[1])[0][0]
except:
snpmatch.die("parents are not in the dataset")
snpsP1 = g_acc.snps[:,indP1]
snpsP2 = g_acc.snps[:,indP2]
commonSNPsCHR = np.array(g_acc.chromosomes)
commonSNPsPOS = np.array(g_acc.positions)
log.info("done!")
log.info("running cross genotyper")
crossGenotypeWindows(commonSNPsCHR, commonSNPsPOS, snpsP1, snpsP2, args['inFile'], args['binLen'], args['outFile'], args['logDebug'])
评论列表
文章目录