def get_total_conf_mapped_reads_in_cells(filename, filtered_barcodes, mem_gb):
""" Number of confidently mapped reads w/ valid, filtered barcodes.
Because this is called from a 'split' function, we must stay within the given mem limit.
NOTE: We re-open the file for each chunk IN ISOLATED PROCESSES
due to a possible memory leak in h5py. Tests show the mem usage is nondeterministic, too.
https://github.com/h5py/h5py/issues/763 (among many others)
Args: filtered_barcodes (set) - set of barcode strings (e.g., ACGT-1)
filename (str) - path to molecule info HDF5 file
mem_gb (int) - limit memory usage to this value """
filtered_bcs_set = set(MoleculeCounter.get_compressed_bc_iter(filtered_barcodes))
entries_per_chunk = int(np.floor(float(mem_gb*1e9)) / MoleculeCounter.get_record_bytes())
print 'Entries per chunk: %d' % entries_per_chunk
with MoleculeCounter.open(filename, 'r') as mc:
num_entries = mc.nrows()
total_mapped_reads = 0
for start in xrange(0, num_entries, entries_per_chunk):
queue = multiprocessing.Queue()
p = multiprocessing.Process(target=MoleculeCounter.get_total_conf_mapped_reads_in_cells_chunk,
args=(filename, filtered_bcs_set, start, entries_per_chunk, queue))
p.start()
p.join()
total_mapped_reads += queue.get()
return total_mapped_reads
# (removed scraped web-page navigation residue: "评论列表" / "文章目录" —
#  "comment list" / "article table of contents"; bare non-ASCII text here is a
#  syntax error in Python 2)