discoverj.py 文件源码-python代码片段

def count_full_text_occurrences(candidates, table_path, other_gene, other_errors, merge, min_count):
    """
    Count verbatim occurrences of candidate sequences in a table of reads.

    The table at *table_path* is read in chunks as a tab-separated file.
    Only rows whose *other_errors* column equals 0 and that have a
    non-missing ``genomic_sequence`` are considered. For every such row,
    each candidate whose sequence is a substring of the row's
    ``genomic_sequence`` is updated in place:

    - ``count`` is incremented by one,
    - the row's *other_gene* value is added to ``other_genes``,
    - the row's ``CDR3_nt`` value is added to ``cdr3s``.

    Arguments:
    candidates -- iterable of objects with the attributes ``sequence``,
        ``max_count``, ``count``, ``other_genes`` (set-like) and ``cdr3s``
        (set-like). Candidates sharing the same ``sequence`` collapse to
        the last one seen.
    table_path -- path (or anything ``pandas.read_csv`` accepts) of the
        tab-separated table. It must contain the columns *other_gene*,
        *other_errors*, ``V_errors``, ``J_errors``, ``CDR3_nt`` and
        ``genomic_sequence``.
    other_gene -- name of the column whose values are collected into
        ``other_genes``.
    other_errors -- name of the column that must be 0 for a row to be used.
    merge -- if true, a row is attributed only to the first matching
        candidate in search order (highest ``max_count`` first); if false,
        it is attributed to every matching candidate.
    min_count -- candidates with ``max_count`` below this value are ignored
        and not returned.

    Return a view of the candidates that were searched for (those with
    ``max_count >= min_count``), updated in place.
    """
    # Use only records that have a chance of reaching the required min_count
    records = {info.sequence: info for info in candidates if info.max_count >= min_count}
    if not records:
        # Nothing to search for, so do not scan the (possibly huge) table
        return records.values()

    # Count full-text occurrences in the genomic_sequence, circumventing
    # inaccurate IgBLAST alignment boundaries
    # TODO limit the search to the gene region (especially for D genes)
    # Speed up search by looking for most common sequences first
    search_order = sorted(records, key=lambda s: records[s].max_count, reverse=True)
    cols = [other_gene, 'V_errors', 'J_errors', 'CDR3_nt', 'genomic_sequence']
    if other_errors not in cols:
        # The filter column must be loaded as well, otherwise the lookup
        # below fails with a KeyError
        cols.append(other_errors)
    for chunk in pd.read_csv(table_path, usecols=cols, chunksize=10000, sep='\t'):
        # Empty genomic_sequence fields are parsed as NaN (a float); a
        # substring test against them would raise TypeError, and they cannot
        # contain any needle anyway
        usable = (chunk[other_errors] == 0) & chunk['genomic_sequence'].notna()
        chunk = chunk[usable]
        # Iterate the columns directly instead of itertuples() + getattr(),
        # which breaks when a column name is not a valid Python identifier
        rows = zip(chunk['genomic_sequence'], chunk[other_gene], chunk['CDR3_nt'])
        for haystack, gene, cdr3 in rows:
            for needle in search_order:
                if needle in haystack:
                    record = records[needle]
                    record.count += 1
                    record.other_genes.add(gene)
                    record.cdr3s.add(cdr3)
                    if merge:
                        break
    return records.values()