def count_full_text_occurrences(candidates, table_path, other_gene, other_errors, merge, min_count):
    """Count how often each candidate sequence occurs verbatim in a table.

    Every candidate whose ``max_count`` is at least ``min_count`` is searched
    for as a plain substring of the ``genomic_sequence`` column of the
    tab-separated table at ``table_path``. Only rows whose ``other_errors``
    column equals 0 and whose ``genomic_sequence`` is present are considered.

    For each hit the matching candidate is updated in place: ``count`` is
    incremented, the row's ``other_gene`` value is added to ``other_genes``
    and the row's ``CDR3_nt`` value is added to ``cdr3s``.

    Args:
        candidates: Iterable of objects exposing ``sequence``, ``max_count``,
            ``count``, ``other_genes`` (a set) and ``cdr3s`` (a set). If two
            candidates share a sequence, only the last one is kept.
        table_path: Path (or file-like object) accepted by ``pd.read_csv``.
        other_gene: Name of the column whose value is recorded per hit.
        other_errors: Name of the column that must be 0 for a row to count;
            it must be one of the columns that are read ('V_errors' or
            'J_errors', unless it equals ``other_gene``).
        merge: If true, a row is credited only to the first matching
            candidate in search order (highest ``max_count`` first).
        min_count: Candidates with a smaller ``max_count`` are dropped.

    Returns:
        A view of the retained candidate objects (``dict.values()``), in the
        order in which they appeared in ``candidates``. If no candidate is
        retained, the table is not read at all.
    """
    # Use only records that have a chance of reaching the required min_count
    records = {info.sequence: info for info in candidates if info.max_count >= min_count}
    if not records:
        # Nothing can match, so do not parse the (potentially large) table.
        return records.values()

    # Count full-text occurrences in the genomic_sequence, circumventing
    # inaccurate IgBLAST alignment boundaries
    # TODO limit the search to the gene region (especially for D genes)
    # Speed up search by looking for most common sequences first
    search_order = sorted(records, key=lambda s: records[s].max_count, reverse=True)
    cols = [other_gene, 'V_errors', 'J_errors', 'CDR3_nt', 'genomic_sequence']
    for chunk in pd.read_csv(table_path, usecols=cols, chunksize=10000, sep='\t'):
        # An empty genomic_sequence field is parsed as NaN (a float); a
        # substring test against it would raise TypeError, and such a row
        # cannot contain any candidate anyway, so it is dropped here.
        usable = (chunk[other_errors] == 0) & chunk['genomic_sequence'].notna()
        for row in chunk[usable].itertuples():
            haystack = row.genomic_sequence
            for needle in search_order:
                if needle not in haystack:
                    continue
                record = records[needle]
                record.count += 1
                record.other_genes.add(getattr(row, other_gene))
                record.cdr3s.add(row.CDR3_nt)
                if merge:
                    # Credit the row only to the most common matching candidate.
                    break
    return records.values()
评论列表
文章目录