def write_genome_gtf(self, out_gtf_fn):
    """Write one combined GTF for all input genomes to ``out_gtf_fn``.

    ``self.genome_prefixes`` and ``self.in_gtf_fns`` are walked in lockstep;
    each input GTF is read through ``self.gtf_reader_iter``, which yields
    ``(row, is_comment, properties)`` tuples.

    * Comment rows are copied to the output unchanged.
    * When more than one genome is configured (``len(self.genomes) > 1``),
      the chromosome name (``row[0]``) and the ``transcript_id``,
      ``gene_id`` and ``gene_name`` properties are rewritten as
      ``'<genome_prefix>_<value>'``; with a single genome they are left
      as they are.
    * A record whose ``transcript_id`` was already seen on a different
      chromosome within the same input GTF is dropped. If any such
      transcripts were found, a warning listing them is printed after
      that input GTF has been processed.
    * ``row[8]`` is replaced by ``self.format_properties_dict(properties)``
      before the row is written.

    Args:
        out_gtf_fn: Path of the GTF file to create (overwritten if present).
    """
    # Binary mode: the Python 2 csv module expects files opened with 'b'.
    with open(out_gtf_fn, 'wb') as f:
        writer = csv.writer(f, delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='')

        # Prefixing is only needed to disambiguate names across genomes.
        multi_genome = len(self.genomes) > 1

        for genome_prefix, in_gtf_fn in itertools.izip(self.genome_prefixes, self.in_gtf_fns):
            if multi_genome:
                prefix_func = lambda s: '%s_%s' % (genome_prefix, s)
            else:
                prefix_func = lambda s: s

            # Both are tracked per input GTF, so they are reset for each genome.
            transcript_to_chrom = {}
            cross_chrom_transcripts = set()

            for row, is_comment, properties in self.gtf_reader_iter(in_gtf_fn):
                if is_comment:
                    writer.writerow(row)
                    continue

                chrom = prefix_func(row[0])
                row[0] = chrom

                if 'transcript_id' in properties:
                    properties['transcript_id'] = prefix_func(properties['transcript_id'])
                    curr_tx = properties['transcript_id']
                    if curr_tx in transcript_to_chrom and transcript_to_chrom[curr_tx] != chrom:
                        # ignore recurrences of a transcript on different chromosomes - it will break the STAR index
                        cross_chrom_transcripts.add(curr_tx)
                        continue
                    transcript_to_chrom[curr_tx] = chrom
                if 'gene_id' in properties:
                    properties['gene_id'] = prefix_func(properties['gene_id'])
                if 'gene_name' in properties:
                    properties['gene_name'] = prefix_func(properties['gene_name'])

                row[8] = self.format_properties_dict(properties)
                writer.writerow(row)

            # Only warn when something was actually dropped; an unconditional
            # warning followed by an empty list is misleading.
            if cross_chrom_transcripts:
                print("WARNING: The following transcripts appear on multiple chromosomes in the GTF:")
                # Sorted so the warning is reproducible from run to run.
                print('\n'.join(sorted(cross_chrom_transcripts)) + '\n')
                print("This can indicate a problem with the reference or annotations. Only the first chromosome will be counted.")
评论列表
文章目录