csmatch.py 文件源码-python代码片段

def crossGenotyper(args):
    ## Get the VCF file (filtered may be) generated by GATK.
    ## inputs:
    # 1) VCF file
    # 2) Parent1 and Parent2
    # 3) SNP matrix (hdf5 file)
    # 4) Bin length, default as 200Kbp
    # 5) Chromosome length
    log.info("loading genotype data for parents")
    if args['father'] is not None:
        log.info("input files: %s and %s" % (args['parents'], args['father']))
        if not os.path.isfile(args['parents']) and os.path.isfile(args['father']):
            die("either of the input files do not exists, please provide VCF/BED file for parent genotype information")
        (p1snpCHR, p1snpPOS, p1snpGT, p1snpWEI, p1DPmean) = snpmatch.parseInput(inFile = args['parents'], logDebug = args['logDebug'])
        (p2snpCHR, p2snpPOS, p2snpGT, p2snpWEI, p2DPmean) = snpmatch.parseInput(inFile = args['father'], logDebug = args['logDebug'])
        commonCHRs_ids = np.union1d(p1snpCHR, p2snpCHR)
        commonSNPsCHR = np.zeros(0, dtype=commonCHRs_ids.dtype)
        commonSNPsPOS = np.zeros(0, dtype=int)
        snpsP1 = np.zeros(0, dtype='int8')
        snpsP2 = np.zeros(0, dtype='int8')
        for i in commonCHRs_ids:
            perchrP1inds = np.where(p1snpCHR == i)[0]
            perchrP2inds = np.where(p2snpCHR == i)[0]
            perchrPositions = np.union1d(p1snpPOS[perchrP1inds], p2snpPOS[perchrP2inds])
            commonSNPsCHR = np.append(commonSNPsCHR, np.repeat(i, len(perchrPositions)))
            commonSNPsPOS = np.append(commonSNPsPOS, perchrPositions)
            perchrsnpsP1 = np.repeat(-1, len(perchrPositions)).astype('int8')
            perchrsnpsP2 = np.repeat(-1, len(perchrPositions)).astype('int8')
            perchrsnpsP1_inds = np.where(np.in1d(p1snpPOS[perchrP1inds], perchrPositions))[0]
            perchrsnpsP2_inds = np.where(np.in1d(p2snpPOS[perchrP2inds], perchrPositions))[0]
            snpsP1 = np.append(snpsP1, snpmatch.parseGT(p1snpGT[perchrsnpsP1_inds]))
            snpsP2 = np.append(snpsP2, snpmatch.parseGT(p2snpGT[perchrsnpsP2_inds]))
        log.info("done!")
    else:
        parents = args['parents']
        ## need to filter the SNPs present in C and M
        if not args['hdf5accFile']:
            snpmatch.die("needed a HDF5 genotype file and not specified")
        log.info("loading HDF5 file")
        g_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
        ## die if either parents are not in the dataset
        #import ipdb; ipdb.set_trace()
        try:
            indP1 = np.where(g_acc.accessions == parents.split("x")[0])[0][0]
            indP2 = np.where(g_acc.accessions == parents.split("x")[1])[0][0]
        except:
            snpmatch.die("parents are not in the dataset")
        snpsP1 = g_acc.snps[:,indP1]
        snpsP2 = g_acc.snps[:,indP2]
        commonSNPsCHR = np.array(g_acc.chromosomes)
        commonSNPsPOS = np.array(g_acc.positions)
        log.info("done!")
    log.info("running cross genotyper")
    crossGenotypeWindows(commonSNPsCHR, commonSNPsPOS, snpsP1, snpsP2, args['inFile'], args['binLen'], args['outFile'], args['logDebug'])