From 4b39a4cf9113e01d80377236020de418a0381358 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey
Date: Tue, 4 Dec 2018 14:42:28 -0800
Subject: [PATCH] Fix GPU Direct RDMA detection.

Whether the network supported GPU Direct RDMA or not was ignored,
causing sockets to break when cards were local enough that NCCL
tried to use it.
---
 src/transport/net.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transport/net.cu b/src/transport/net.cu
index 165187c..9c366b3 100644
--- a/src/transport/net.cu
+++ b/src/transport/net.cu
@@ -241,7 +241,7 @@ static ncclResult_t netGetGdrSupport(int dev, int distance, int read, int* useGd
   // Finally, check if the NIC supports it
   int flags;
   NCCLCHECK(ncclNetPtrSupport(dev, &flags));
-  if (flags & NCCL_PTR_CUDA == 0) return ncclSuccess;
+  if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess;
   *useGdr = 1;
   INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d / HCA %d (distance %d >= %d), read %d", ncclNetName(), cudaDev, dev, distance, netGdrLevel, read);
   return ncclSuccess;