Fix #224: prevent the number of IB devices from going out of bounds

This commit is contained in:
Ke Wen 2019-07-16 08:41:56 -07:00
parent c8c68fb5f7
commit 920ae57c14

View File

@ -112,13 +112,13 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return ncclInternalError; if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return ncclInternalError;
for (int d=0; d<nIbDevs; d++) { for (int d=0; d<nIbDevs && ncclNIbDevs<MAX_IB_DEVS; d++) {
struct ibv_context * context; struct ibv_context * context;
if (ncclSuccess != wrap_ibv_open_device(&context, devices[d]) || context == NULL) { if (ncclSuccess != wrap_ibv_open_device(&context, devices[d]) || context == NULL) {
WARN("NET/IB : Unable to open device %s", devices[d]->name); WARN("NET/IB : Unable to open device %s", devices[d]->name);
continue; continue;
} }
int found = 0; int nPorts = 0;
struct ibv_device_attr devAttr; struct ibv_device_attr devAttr;
memset(&devAttr, 0, sizeof(devAttr)); memset(&devAttr, 0, sizeof(devAttr));
if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) { if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
@ -148,10 +148,10 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
ncclIbDevs[ncclNIbDevs].context = context; ncclIbDevs[ncclNIbDevs].context = context;
strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE); strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
ncclNIbDevs++; ncclNIbDevs++;
found++; nPorts++;
pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context); pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
} }
if (found == 0 && ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
} }
if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return ncclInternalError; }; if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return ncclInternalError; };
} }