def filtered_table(table,
        v_gene_coverage,  # at least
        j_gene_coverage,  # at least
        v_gene_evalue,  # at most
    ):
    """
    Discard the following rows in the table, in this order:
    - V gene or J gene not assigned (empty string in V_gene or J_gene)
    - stop codon found (value of the 'stop' column is not 'no')
    - V gene E-value greater than v_gene_evalue
    - V gene coverage less than v_gene_coverage
    - J gene coverage less than j_gene_coverage

    The V_gene column of the result is converted to a categorical column.
    The filtering works on a copy, so the table passed in is not modified.

    Return a tuple (filtered, stats). filtered is the filtered table. stats
    is a FilteringStatistics object with these attributes set:
    - n: number of rows in the input table
    - vjassigned: rows left after the V/J assignment filter
    - stop: rows left after the stop codon filter
    - v_evalue: rows left after the V gene E-value filter
    - v_coverage: rows left after the V gene coverage filter
    - j_coverage: rows left after the J gene coverage filter
    Each count is cumulative, that is, it includes the effect of all
    previous filters.
    """
    stats = FilteringStatistics()
    stats.n = len(table)

    # Both V and J must be assigned
    # (Note V_gene and J_gene columns use empty strings instead of NA)
    # The explicit copy makes the column assignment below safe: it neither
    # touches the caller's table nor triggers a SettingWithCopyWarning.
    vj_assigned = (table['V_gene'] != '') & (table['J_gene'] != '')
    filtered = table[vj_assigned].copy()
    stats.vjassigned = len(filtered)
    filtered['V_gene'] = pd.Categorical(filtered['V_gene'])

    # Filter out sequences that have a stop codon
    filtered = filtered[filtered['stop'] == 'no']
    stats.stop = len(filtered)

    # Filter out sequences with a too high V gene hit E-value
    filtered = filtered[filtered['V_evalue'] <= v_gene_evalue]
    stats.v_evalue = len(filtered)

    # Filter out sequences with too low V gene coverage
    filtered = filtered[filtered['V_covered'] >= v_gene_coverage]
    stats.v_coverage = len(filtered)

    # Filter out sequences with too low J gene coverage
    filtered = filtered[filtered['J_covered'] >= j_gene_coverage]
    stats.j_coverage = len(filtered)

    return filtered, stats
评论列表
文章目录