Fix GPU Direct RDMA detection.
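Whether the network supported GPU Direct RDMA was effectively ignored: because `==` binds tighter than `&` in C, the check `flags & NCCL_PTR_CUDA == 0` was parsed as `flags & (NCCL_PTR_CUDA == 0)`, so the early return was never taken. This caused sockets to break when cards were local enough that NCCL tried to use GPU Direct RDMA.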
This commit is contained in:
parent
b8a9a32ccb
commit
4b39a4cf91
@ -241,7 +241,7 @@ static ncclResult_t netGetGdrSupport(int dev, int distance, int read, int* useGd
|
|||||||
// Finally, check if the NIC supports it
|
// Finally, check if the NIC supports it
|
||||||
int flags;
|
int flags;
|
||||||
NCCLCHECK(ncclNetPtrSupport(dev, &flags));
|
NCCLCHECK(ncclNetPtrSupport(dev, &flags));
|
||||||
if (flags & NCCL_PTR_CUDA == 0) return ncclSuccess;
|
if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess;
|
||||||
*useGdr = 1;
|
*useGdr = 1;
|
||||||
INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d / HCA %d (distance %d >= %d), read %d", ncclNetName(), cudaDev, dev, distance, netGdrLevel, read);
|
INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d / HCA %d (distance %d >= %d), read %d", ncclNetName(), cudaDev, dev, distance, netGdrLevel, read);
|
||||||
return ncclSuccess;
|
return ncclSuccess;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user