def reduce_multigpu(tensor_list, dst, op=reduce_op.SUM, group=group.WORLD):
"""Reduces the tensor data on multiple GPUs across all machines. Each tensor
in tensor_list should reside on a separate GPU
Only the GPU of tensor_list[0] on the process with rank ``dst`` is
going to receive the final result.
Only nccl backend is currently supported
tensors should only be GPU tensors
Arguments:
tensor_list (List[Tensor]): Input and output GPU tensors of the
collective . The function operates in-place.
dst (int): Destination rank
op (optional): One of the values from ``torch.distributed.reduce_op``
enum. Specifies an operation used for element-wise reductions.
group (optional): Group of the collective.
"""
    assert torch.distributed._initialized == _INITIALIZED_PG, \
        "collective only supported in process-group mode"
warnings.warn("""
================================================================================
WARNING
================================================================================
reduce__multigpu is still experimental. The API will change without
notice and we're can't guarantee full correctness and expected performance yet.
We'll announce it once it's ready.
""")
    return torch._C._dist_reduce_multigpu(tensor_list, dst, op, group)
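

# Hedged usage sketch (illustrative addition, not part of the original module).
# It shows how reduce_multigpu might be driven from a multi-process job in which
# each process owns all of its local GPUs. The helper name, the init_method
# address, and the rank/world_size handling below are assumptions made purely
# for illustration; adapt them to your own launcher.
def _example_reduce_multigpu(rank, world_size):
    import torch
    import torch.distributed as dist

    # NCCL is the only backend this collective supports; the rendezvous
    # address is a placeholder assumption.
    dist.init_process_group(backend="nccl",
                            init_method="tcp://127.0.0.1:23456",
                            world_size=world_size, rank=rank)

    # One tensor per local GPU; every process passes all of its GPU tensors.
    tensor_list = [torch.ones(10).cuda(i)
                   for i in range(torch.cuda.device_count())]

    # After the call, tensor_list[0] on the process with rank 0 holds the
    # element-wise sum across every GPU of every process.
    dist.reduce_multigpu(tensor_list, dst=0, op=dist.reduce_op.SUM)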