def parse_contigs(outputAssembly, threshold_float, verbose):
"""
Parses the output from iAssembler, to a single FASTA file
"""
if verbose:
sys.stderr.write('Executing: Parse assembled consensus\n')
fname = outputAssembly + 'contig_member'
fname_exists = os.path.isfile(fname)
if fname_exists:
# part all the files in the tmp assembly folder
contigInfo = open(outputAssembly + 'contig_member', 'r')
contigs = {}
total_reads = 0
for line in contigInfo:
line = line.strip()
line = line.split('\t')
# count the reads for each assembled Unitig
read_number = len(line) - 1
for element in line:
if 'evm' in element:
contigs[line[0]] = [read_number, element]
if line[0] not in contigs:
contigs[line[0]] = [read_number]
# sum all the reads whitin one cluster
total_reads += read_number
contigInfo.close()
# calculate the number of reads considering the threshold
threshold = total_reads * float(threshold_float)
global count_sequences
real_contigs = {}
count_unitigs = 1
for key, element in list(contigs.items()):
# to retrieve only supported assembly
if len(element) == 2:
if element[0] >= threshold and 'evm' in element[1]:
real_contigs[key] = element[1] + ' ' + str(
element[0]) + '_above_threshold_' + str(threshold) + ' loc_' + str(count_sequences)
elif element[0] < threshold and 'evm' in element[1]:
real_contigs[key] = element[1] + ' ' + str(
element[0]) + '_below_threshold_' + str(threshold) + ' loc_' + str(count_sequences)
elif len(element) == 1:
if element[0] >= threshold:
real_contigs[key] = 'Unitig' + str(count_sequences) + '_' + str(count_unitigs) + ' ' + str(
element[0]) + '_above_threshold_' + str(threshold) + ' loc_' + str(count_sequences)
count_unitigs += 1
# writes the outputs
fileAssembly = outputAssembly + 'unigene_seq.new.fasta'
contigSeq = open(fileAssembly, 'r')
contigDict = SeqIO.to_dict(SeqIO.parse(contigSeq, 'fasta'))
output_filename = outputAssembly[:-1] + '_assembled.fasta'
outputFile = open(output_filename, 'w')
for iden, write_iden in list(real_contigs.items()):
if iden in contigDict:
outputFile.write('>' + write_iden + '\n' + str(contigDict[iden].seq) + '\n')
contigSeq.close()
outputFile.close()
return
评论列表
文章目录