run_count_mutation_analysis.py 文件源码-python代码片段

def extract_kmer_seq(kmer_bedfile, hg19_fa_file, outfile_kmers_seq):
    '''
    open bed file to write the kmer sequence
    ''' 
    with open(outfile_kmers_seq,'w') as of:
        pass

    wf = open(outfile_kmers_seq,'a+')

    # read names and postions from bed file
    records = SeqIO.to_dict(SeqIO.parse(open(hg19_fa_file), 'fasta'))

    with open(kmer_bedfile) as rf:
        for line in rf:
            name,start,stop,gene,val,strand = bedline_split(line)
            long_seq_record = records[name]
            long_seq = long_seq_record.seq
            alphabet = long_seq.alphabet
            short_seq = str(long_seq)[start-1:stop]
            short_seq_record = SeqRecord(Seq(short_seq, alphabet))
            short_seq_record.seq.strip()

            '''
            If the sequence is in reverse strand, rev complement it
            #edit 14-march-2017 (to extract kmers strictly to half open bed format)
            '''
            if strand == "+":
                kmer_seq = short_seq_record.seq
                #kmer created will be one base greater than the kmer size, trim
                kmer_seq = kmer_seq[:1]

            elif strand == "-":
                kmer_seq = short_seq_record.seq.reverse_complement() 
                kmer_seq = kmer_seq[:-1]
            else:
                print("Strand information not found for line:\n {}".format(line))
                quit()

            wf.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(name,start,stop,gene,kmer_seq,strand))

    rf.close()
    wf.close()

    file_chk = check_files(outfile_kmers_seq)
    return(file_chk)