# reference.py source file -- Python code snippet

def write_genome_gtf(self, out_gtf_fn):
        """Write the records of every input GTF into one tab-delimited file.

        Each ``(genome_prefix, in_gtf_fn)`` pair taken from
        ``self.genome_prefixes`` / ``self.in_gtf_fns`` is read through
        ``self.gtf_reader_iter`` and written to ``out_gtf_fn``:

        * comment rows are copied through unchanged;
        * when more than one genome is configured (``len(self.genomes) > 1``)
          the chromosome name (column 1) and the ``transcript_id``,
          ``gene_id`` and ``gene_name`` properties are rewritten as
          ``<genome_prefix>_<value>``; with a single genome they are left
          as they are;
        * a row whose transcript was already seen on a different chromosome
          of the same input GTF is dropped, and the transcript is reported in
          a warning printed to stdout once that GTF has been processed.

        Args:
            out_gtf_fn: Path of the GTF file to create (overwritten if present).
        """
        import sys

        # The csv module expects a binary handle on Python 2 and a text handle
        # opened with newline='' on Python 3; either way the writer's own line
        # terminator reaches the file without translation.
        if sys.version_info[0] < 3:
            out_file = open(out_gtf_fn, 'wb')
        else:
            out_file = open(out_gtf_fn, 'w', newline='')

        with out_file as f:
            # quotechar=None is the portable spelling of "no quote character";
            # an empty string is rejected by the csv module on Python 3.11+.
            writer = csv.writer(f, delimiter='\t', quoting=csv.QUOTE_NONE, quotechar=None)
            for genome_prefix, in_gtf_fn in zip(self.genome_prefixes, self.in_gtf_fns):
                use_prefix = len(self.genomes) > 1

                # Defaults bind this iteration's values, so the helper cannot
                # pick up a later genome's prefix through late binding.
                def prefix_func(name, prefix=genome_prefix, enabled=use_prefix):
                    return '%s_%s' % (prefix, name) if enabled else name

                transcript_to_chrom = {}
                cross_chrom_transcripts = set()
                for row, is_comment, properties in self.gtf_reader_iter(in_gtf_fn):
                    if is_comment:
                        writer.writerow(row)
                        continue

                    chrom = prefix_func(row[0])
                    row[0] = chrom

                    if 'transcript_id' in properties:
                        curr_tx = prefix_func(properties['transcript_id'])
                        properties['transcript_id'] = curr_tx
                        if transcript_to_chrom.get(curr_tx, chrom) != chrom:
                            # ignore recurrences of a transcript on different
                            # chromosomes - it will break the STAR index
                            cross_chrom_transcripts.add(curr_tx)
                            continue
                        transcript_to_chrom[curr_tx] = chrom

                    for key in ('gene_id', 'gene_name'):
                        if key in properties:
                            properties[key] = prefix_func(properties[key])

                    # Column 9 (index 8) carries the re-serialised properties.
                    row[8] = self.format_properties_dict(properties)

                    writer.writerow(row)

                # Only warn when something was actually dropped; the ids are
                # sorted so the report is stable from run to run.
                if cross_chrom_transcripts:
                    print("WARNING: The following transcripts appear on multiple chromosomes in the GTF:")
                    print('\n'.join(sorted(cross_chrom_transcripts)) + '\n')
                    print("This can indicate a problem with the reference or annotations. Only the first chromosome will be counted.")