def main(args):
if args.minimum_frequency is None:
minimum_frequency = max((len(args.tables) + 1) // 2, 2)
else:
minimum_frequency = args.minimum_frequency
logger.info('Minimum frequency set to %s', minimum_frequency)
# Read in tables
tables = []
for path in args.tables:
table = pd.read_csv(path, sep='\t')
table = table[table.database_diff >= args.minimum_db_diff]
table = table.dropna()
tables.append(table)
if len(table) == 0:
logger.warn('Table read from %r is empty after filtering out sequences with database diff >= %s.', path, args.minimum_db_diff)
# Count V sequence occurrences
counter = Counter()
for table in tables:
counter.update(set(table.consensus))
# Find most frequent occurrences and print result
print('count', 'gene', 'database_diff', 'sequence', 'names', sep='\t')
for sequence, frequency in counter.most_common():
if frequency < minimum_frequency:
break
names = []
gene = None
for table in tables:
matching_rows = table[table.consensus == sequence]
if matching_rows.empty:
continue
names.extend(matching_rows.name)
if gene is None:
row = matching_rows.iloc[0]
gene = row.gene
database_diff = row.database_diff
#shm = row['V_SHM']
print(frequency, gene, database_diff, sequence, *names, sep='\t')
评论列表
文章目录