Skip to content

vllm.distributed.device_communicators.pynccl_wrapper

__all__ module-attribute

__all__ = [
    "NCCLLibrary",
    "ncclDataTypeEnum",
    "ncclRedOpTypeEnum",
    "ncclUniqueId",
    "ncclComm_t",
    "cudaStream_t",
    "buffer_type",
]

buffer_type module-attribute

buffer_type = c_void_p

cudaStream_t module-attribute

cudaStream_t = c_void_p

logger module-attribute

logger = init_logger(__name__)

ncclComm_t module-attribute

ncclComm_t = c_void_p

ncclDataType_t module-attribute

ncclDataType_t = c_int

ncclRedOp_t module-attribute

ncclRedOp_t = c_int

ncclResult_t module-attribute

ncclResult_t = c_int

ncclWindow_t module-attribute

ncclWindow_t = c_void_p

Function dataclass

Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
@dataclass
class Function:
    """Declaration of one C function exported by the NCCL shared library.

    Each entry describes the ctypes signature (return type and argument
    types) that is applied to the symbol after it is looked up via
    ``getattr`` on the loaded library.
    """

    # Symbol name in the shared library, e.g. "ncclAllReduce".
    name: str
    # ctypes return type (e.g. ncclResult_t, ctypes.c_char_p).
    restype: Any
    # ctypes argument types, in call order.
    argtypes: list[Any]

argtypes instance-attribute

argtypes: list[Any]

name instance-attribute

name: str

restype instance-attribute

restype: Any

__init__

__init__(
    name: str, restype: Any, argtypes: list[Any]
) -> None

NCCLLibrary

Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
class NCCLLibrary:
    """Thin ctypes wrapper around the NCCL shared library.

    The loaded library handle and the resolved function pointers are
    cached at class level, keyed by the library path, so constructing
    multiple instances for the same ``.so`` file does not reload the
    library or re-resolve symbols.
    """

    exported_functions = [
        # const char* ncclGetErrorString(ncclResult_t result)
        Function("ncclGetErrorString", ctypes.c_char_p, [ncclResult_t]),
        # ncclResult_t  ncclGetVersion(int *version);
        Function("ncclGetVersion", ncclResult_t, [ctypes.POINTER(ctypes.c_int)]),
        # ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
        Function("ncclGetUniqueId", ncclResult_t, [ctypes.POINTER(ncclUniqueId)]),
        # ncclResult_t  ncclCommInitRank(
        #   ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
        # note that ncclComm_t is a pointer type, so the first argument
        # is a pointer to a pointer
        Function(
            "ncclCommInitRank",
            ncclResult_t,
            [ctypes.POINTER(ncclComm_t), ctypes.c_int, ncclUniqueId, ctypes.c_int],
        ),
        # ncclResult_t  ncclAllReduce(
        #   const void* sendbuff, void* recvbuff, size_t count,
        #   ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
        #   cudaStream_t stream);
        # note that cudaStream_t is a pointer type, so the last argument
        # is a pointer
        Function(
            "ncclAllReduce",
            ncclResult_t,
            [
                buffer_type,
                buffer_type,
                ctypes.c_size_t,
                ncclDataType_t,
                ncclRedOp_t,
                ncclComm_t,
                cudaStream_t,
            ],
        ),
        # ncclResult_t  ncclReduce(
        #   const void* sendbuff, void* recvbuff, size_t count,
        #   ncclDataType_t datatype, ncclRedOp_t op, int root,
        #   ncclComm_t comm,  cudaStream_t stream);
        # note that cudaStream_t is a pointer type, so the last argument
        # is a pointer
        Function(
            "ncclReduce",
            ncclResult_t,
            [
                buffer_type,
                buffer_type,
                ctypes.c_size_t,
                ncclDataType_t,
                ncclRedOp_t,
                ctypes.c_int,
                ncclComm_t,
                cudaStream_t,
            ],
        ),
        # ncclResult_t  ncclAllGather(
        #   const void* sendbuff, void* recvbuff, size_t count,
        #   ncclDataType_t datatype, ncclComm_t comm,
        #   cudaStream_t stream);
        # note that cudaStream_t is a pointer type, so the last argument
        # is a pointer
        Function(
            "ncclAllGather",
            ncclResult_t,
            [
                buffer_type,
                buffer_type,
                ctypes.c_size_t,
                ncclDataType_t,
                ncclComm_t,
                cudaStream_t,
            ],
        ),
        # ncclResult_t  ncclReduceScatter(
        #   const void* sendbuff, void* recvbuff, size_t count,
        #   ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
        #   cudaStream_t stream);
        # note that cudaStream_t is a pointer type, so the last argument
        # is a pointer
        Function(
            "ncclReduceScatter",
            ncclResult_t,
            [
                buffer_type,
                buffer_type,
                ctypes.c_size_t,
                ncclDataType_t,
                ncclRedOp_t,
                ncclComm_t,
                cudaStream_t,
            ],
        ),
        # ncclResult_t  ncclSend(
        #   const void* sendbuff, size_t count, ncclDataType_t datatype,
        #   int dest, ncclComm_t comm, cudaStream_t stream);
        Function(
            "ncclSend",
            ncclResult_t,
            [
                buffer_type,
                ctypes.c_size_t,
                ncclDataType_t,
                ctypes.c_int,
                ncclComm_t,
                cudaStream_t,
            ],
        ),
        # ncclResult_t  ncclRecv(
        #   void* recvbuff, size_t count, ncclDataType_t datatype,
        #   int src, ncclComm_t comm, cudaStream_t stream);
        Function(
            "ncclRecv",
            ncclResult_t,
            [
                buffer_type,
                ctypes.c_size_t,
                ncclDataType_t,
                ctypes.c_int,
                ncclComm_t,
                cudaStream_t,
            ],
        ),
        # ncclResult_t ncclBroadcast(
        #   const void* sendbuff, void* recvbuff, size_t count,
        #   ncclDataType_t datatype, int root, ncclComm_t comm,
        #   cudaStream_t stream);
        Function(
            "ncclBroadcast",
            ncclResult_t,
            [
                buffer_type,
                buffer_type,
                ctypes.c_size_t,
                ncclDataType_t,
                ctypes.c_int,
                ncclComm_t,
                cudaStream_t,
            ],
        ),
        # be cautious! this is a collective call, it will block until all
        # processes in the communicator have called this function.
        # because Python object destruction can happen in random order,
        # it is better not to call it at all.
        # ncclResult_t  ncclCommDestroy(ncclComm_t comm);
        Function("ncclCommDestroy", ncclResult_t, [ncclComm_t]),
        # ncclResult_t ncclGroupStart();
        Function("ncclGroupStart", ncclResult_t, []),
        # ncclResult_t ncclGroupEnd();
        Function("ncclGroupEnd", ncclResult_t, []),
        # ncclResult_t ncclCommWindowRegister(
        #   ncclComm_t comm, void* buff, size_t size,
        #   ncclWindow_t* win, int winFlags);
        Function(
            "ncclCommWindowRegister",
            ncclResult_t,
            [
                ncclComm_t,
                buffer_type,
                ctypes.c_size_t,
                ctypes.POINTER(ncclWindow_t),
                ctypes.c_int,
            ],
        ),
        # ncclResult_t ncclCommWindowDeregister(
        #   ncclComm_t comm, ncclWindow_t win);
        Function("ncclCommWindowDeregister", ncclResult_t, [ncclComm_t, ncclWindow_t]),
    ]

    # class attribute to store the mapping from the path to the library
    # to avoid loading the same library multiple times
    path_to_library_cache: dict[str, Any] = {}

    # class attribute to store the mapping from library path
    #  to the corresponding dictionary of resolved functions
    path_to_dict_mapping: dict[str, dict[str, Any]] = {}

    def __init__(self, so_file: Optional[str] = None):
        """Load (or reuse) the NCCL library at `so_file`.

        Args:
            so_file: path to the NCCL shared library. If None, the path
                is discovered via `find_nccl_library()`.

        Raises:
            OSError: if the library cannot be loaded.
            AttributeError: if a required symbol is missing (except the
                optional window-registration symbols, see below).
        """
        so_file = so_file or find_nccl_library()

        try:
            # Guard the dlopen on the library cache itself (not the
            # function dict) so each cache is keyed consistently.
            if so_file not in NCCLLibrary.path_to_library_cache:
                lib = ctypes.CDLL(so_file)
                NCCLLibrary.path_to_library_cache[so_file] = lib
            self.lib = NCCLLibrary.path_to_library_cache[so_file]
        except Exception:
            logger.error(
                "Failed to load NCCL library from %s. "
                "It is expected if you are not running on NVIDIA/AMD GPUs. "
                "Otherwise, the nccl library might not exist, be corrupted "
                "or it does not support the current platform %s. "
                "If you already have the library, please set the "
                "environment variable VLLM_NCCL_SO_PATH"
                " to point to the correct nccl library path.",
                so_file,
                platform.platform(),
            )
            # Re-raise as-is to preserve the original traceback.
            raise

        if so_file not in NCCLLibrary.path_to_dict_mapping:
            # Resolve every exported symbol once and apply its ctypes
            # signature; cache the result per library path.
            _funcs: dict[str, Any] = {}
            for func in NCCLLibrary.exported_functions:
                try:
                    f = getattr(self.lib, func.name)
                    f.restype = func.restype
                    f.argtypes = func.argtypes
                    _funcs[func.name] = f
                except AttributeError:
                    # The window registration API only exists in
                    # NCCL >= 2.27.03; treat it as optional.
                    if func.name in [
                        "ncclCommWindowRegister",
                        "ncclCommWindowDeregister",
                    ]:
                        if envs.VLLM_USE_NCCL_SYMM_MEM:
                            logger.warning_once(
                                "The symbol %s is not found in the NCCL "
                                "library %s. To enable VLLM_USE_NCCL_SYMM_MEM "
                                " please update your NCCL version to >= "
                                "2.27.03.",
                                func.name,
                                so_file,
                            )
                        if current_platform.is_rocm():
                            # Having an exception here on ROCm platform is
                            # not allowed during graph capturing
                            continue
                    raise
            NCCLLibrary.path_to_dict_mapping[so_file] = _funcs
        self._funcs = NCCLLibrary.path_to_dict_mapping[so_file]

    def ncclGetErrorString(self, result: ncclResult_t) -> str:
        """Return the human-readable message for an NCCL result code."""
        return self._funcs["ncclGetErrorString"](result).decode("utf-8")

    def NCCL_CHECK(self, result: ncclResult_t) -> None:
        """Raise RuntimeError if `result` is not ncclSuccess (0)."""
        if result != 0:
            error_str = self.ncclGetErrorString(result)
            raise RuntimeError(f"NCCL error: {error_str}")

    def ncclGetRawVersion(self) -> int:
        """Return the raw integer NCCL version code, e.g. 21903."""
        version = ctypes.c_int()
        self.NCCL_CHECK(self._funcs["ncclGetVersion"](ctypes.byref(version)))
        return version.value

    def ncclGetVersion(self) -> str:
        """Return the NCCL version as a dotted string, e.g. "2.19.3".

        NCCL (>= 2.9) encodes its version as
        ``major * 10000 + minor * 100 + patch``; decode it
        arithmetically. The previous string-slicing approach dropped a
        zero patch component (21900 -> "2.19." instead of "2.19.0").
        """
        version = self.ncclGetRawVersion()
        major, rest = divmod(version, 10000)
        minor, patch = divmod(rest, 100)
        return f"{major}.{minor}.{patch}"

    def ncclGetUniqueId(self) -> ncclUniqueId:
        """Ask NCCL for a fresh unique id used to bootstrap a communicator."""
        unique_id = ncclUniqueId()
        self.NCCL_CHECK(self._funcs["ncclGetUniqueId"](ctypes.byref(unique_id)))
        return unique_id

    def unique_id_from_bytes(self, data: bytes) -> ncclUniqueId:
        """Reconstruct an ncclUniqueId from its 128-byte serialized form.

        128 bytes matches NCCL's NCCL_UNIQUE_ID_BYTES.

        Raises:
            ValueError: if `data` is not exactly 128 bytes.
        """
        if len(data) != 128:
            raise ValueError(
                f"Expected 128 bytes for ncclUniqueId, got {len(data)} bytes"
            )
        unique_id = ncclUniqueId()
        ctypes.memmove(ctypes.addressof(unique_id.internal), data, 128)
        return unique_id

    def ncclCommInitRank(
        self, world_size: int, unique_id: ncclUniqueId, rank: int
    ) -> ncclComm_t:
        """Create a communicator for `rank` within a `world_size` group."""
        comm = ncclComm_t()
        self.NCCL_CHECK(
            self._funcs["ncclCommInitRank"](
                ctypes.byref(comm), world_size, unique_id, rank
            )
        )
        return comm

    def ncclAllReduce(
        self,
        sendbuff: buffer_type,
        recvbuff: buffer_type,
        count: int,
        datatype: int,
        op: int,
        comm: ncclComm_t,
        stream: cudaStream_t,
    ) -> None:
        """All-reduce `count` elements of `datatype` with reduction `op`."""
        # `datatype` actually should be `ncclDataType_t`
        # and `op` should be `ncclRedOp_t`
        # both are aliases of `ctypes.c_int`
        # when we pass int to a function, it will be converted to `ctypes.c_int`
        # by ctypes automatically
        self.NCCL_CHECK(
            self._funcs["ncclAllReduce"](
                sendbuff, recvbuff, count, datatype, op, comm, stream
            )
        )

    def ncclReduce(
        self,
        sendbuff: buffer_type,
        recvbuff: buffer_type,
        count: int,
        datatype: int,
        op: int,
        root: int,
        comm: ncclComm_t,
        stream: cudaStream_t,
    ) -> None:
        """Reduce `count` elements with `op`; the result lands on `root`."""
        # `datatype` actually should be `ncclDataType_t`
        # and `op` should be `ncclRedOp_t`
        # both are aliases of `ctypes.c_int`
        # when we pass int to a function, it will be converted to `ctypes.c_int`
        # by ctypes automatically
        self.NCCL_CHECK(
            self._funcs["ncclReduce"](
                sendbuff, recvbuff, count, datatype, op, root, comm, stream
            )
        )

    def ncclReduceScatter(
        self,
        sendbuff: buffer_type,
        recvbuff: buffer_type,
        count: int,
        datatype: int,
        op: int,
        comm: ncclComm_t,
        stream: cudaStream_t,
    ) -> None:
        """Reduce-scatter: each rank receives `count` reduced elements."""
        # `datatype` actually should be `ncclDataType_t`
        # and `op` should be `ncclRedOp_t`
        # both are aliases of `ctypes.c_int`
        # when we pass int to a function, it will be converted to `ctypes.c_int`
        # by ctypes automatically
        self.NCCL_CHECK(
            self._funcs["ncclReduceScatter"](
                sendbuff, recvbuff, count, datatype, op, comm, stream
            )
        )

    def ncclAllGather(
        self,
        sendbuff: buffer_type,
        recvbuff: buffer_type,
        count: int,
        datatype: int,
        comm: ncclComm_t,
        stream: cudaStream_t,
    ) -> None:
        """All-gather `count` elements from each rank into `recvbuff`."""
        # `datatype` actually should be `ncclDataType_t`
        # which is an alias of `ctypes.c_int`
        # when we pass int to a function, it will be converted to `ctypes.c_int`
        # by ctypes automatically
        self.NCCL_CHECK(
            self._funcs["ncclAllGather"](
                sendbuff, recvbuff, count, datatype, comm, stream
            )
        )

    def ncclSend(
        self,
        sendbuff: buffer_type,
        count: int,
        datatype: int,
        dest: int,
        comm: ncclComm_t,
        stream: cudaStream_t,
    ) -> None:
        """Point-to-point send of `count` elements to rank `dest`."""
        self.NCCL_CHECK(
            self._funcs["ncclSend"](sendbuff, count, datatype, dest, comm, stream)
        )

    def ncclRecv(
        self,
        recvbuff: buffer_type,
        count: int,
        datatype: int,
        src: int,
        comm: ncclComm_t,
        stream: cudaStream_t,
    ) -> None:
        """Point-to-point receive of `count` elements from rank `src`."""
        self.NCCL_CHECK(
            self._funcs["ncclRecv"](recvbuff, count, datatype, src, comm, stream)
        )

    def ncclBroadcast(
        self,
        sendbuff: buffer_type,
        recvbuff: buffer_type,
        count: int,
        datatype: int,
        root: int,
        comm: ncclComm_t,
        stream: cudaStream_t,
    ) -> None:
        """Broadcast `count` elements from rank `root` to all ranks."""
        self.NCCL_CHECK(
            self._funcs["ncclBroadcast"](
                sendbuff, recvbuff, count, datatype, root, comm, stream
            )
        )

    def ncclCommDestroy(self, comm: ncclComm_t) -> None:
        """Destroy a communicator. Collective: all ranks must call it."""
        self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm))

    def ncclGroupStart(self) -> None:
        """Begin a group of NCCL calls to be fused/issued together."""
        self.NCCL_CHECK(self._funcs["ncclGroupStart"]())

    def ncclGroupEnd(self) -> None:
        """End a group started with ncclGroupStart."""
        self.NCCL_CHECK(self._funcs["ncclGroupEnd"]())

    def ncclCommWindowRegister(
        self, comm: ncclComm_t, buff: buffer_type, size: int, win_flags: int
    ) -> ncclWindow_t:
        """Register `buff` as a symmetric-memory window (NCCL >= 2.27.03)."""
        window = ncclWindow_t()
        self.NCCL_CHECK(
            self._funcs["ncclCommWindowRegister"](
                comm, buff, size, ctypes.byref(window), win_flags
            )
        )
        return window

    def ncclCommWindowDeregister(self, comm: ncclComm_t, window: ncclWindow_t) -> None:
        """Deregister a window created by ncclCommWindowRegister."""
        self.NCCL_CHECK(self._funcs["ncclCommWindowDeregister"](comm, window))

_funcs instance-attribute

_funcs = path_to_dict_mapping[so_file]

exported_functions class-attribute instance-attribute

exported_functions = [
    Function(
        "ncclGetErrorString", c_char_p, [ncclResult_t]
    ),
    Function(
        "ncclGetVersion", ncclResult_t, [POINTER(c_int)]
    ),
    Function(
        "ncclGetUniqueId",
        ncclResult_t,
        [POINTER(ncclUniqueId)],
    ),
    Function(
        "ncclCommInitRank",
        ncclResult_t,
        [POINTER(ncclComm_t), c_int, ncclUniqueId, c_int],
    ),
    Function(
        "ncclAllReduce",
        ncclResult_t,
        [
            buffer_type,
            buffer_type,
            c_size_t,
            ncclDataType_t,
            ncclRedOp_t,
            ncclComm_t,
            cudaStream_t,
        ],
    ),
    Function(
        "ncclReduce",
        ncclResult_t,
        [
            buffer_type,
            buffer_type,
            c_size_t,
            ncclDataType_t,
            ncclRedOp_t,
            c_int,
            ncclComm_t,
            cudaStream_t,
        ],
    ),
    Function(
        "ncclAllGather",
        ncclResult_t,
        [
            buffer_type,
            buffer_type,
            c_size_t,
            ncclDataType_t,
            ncclComm_t,
            cudaStream_t,
        ],
    ),
    Function(
        "ncclReduceScatter",
        ncclResult_t,
        [
            buffer_type,
            buffer_type,
            c_size_t,
            ncclDataType_t,
            ncclRedOp_t,
            ncclComm_t,
            cudaStream_t,
        ],
    ),
    Function(
        "ncclSend",
        ncclResult_t,
        [
            buffer_type,
            c_size_t,
            ncclDataType_t,
            c_int,
            ncclComm_t,
            cudaStream_t,
        ],
    ),
    Function(
        "ncclRecv",
        ncclResult_t,
        [
            buffer_type,
            c_size_t,
            ncclDataType_t,
            c_int,
            ncclComm_t,
            cudaStream_t,
        ],
    ),
    Function(
        "ncclBroadcast",
        ncclResult_t,
        [
            buffer_type,
            buffer_type,
            c_size_t,
            ncclDataType_t,
            c_int,
            ncclComm_t,
            cudaStream_t,
        ],
    ),
    Function("ncclCommDestroy", ncclResult_t, [ncclComm_t]),
    Function("ncclGroupStart", ncclResult_t, []),
    Function("ncclGroupEnd", ncclResult_t, []),
    Function(
        "ncclCommWindowRegister",
        ncclResult_t,
        [
            ncclComm_t,
            buffer_type,
            c_size_t,
            POINTER(ncclWindow_t),
            c_int,
        ],
    ),
    Function(
        "ncclCommWindowDeregister",
        ncclResult_t,
        [ncclComm_t, ncclWindow_t],
    ),
]

lib instance-attribute

lib = path_to_library_cache[so_file]

path_to_dict_mapping class-attribute instance-attribute

path_to_dict_mapping: dict[str, dict[str, Any]] = {}

path_to_library_cache class-attribute instance-attribute

path_to_library_cache: dict[str, Any] = {}

NCCL_CHECK

NCCL_CHECK(result: ncclResult_t) -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def NCCL_CHECK(self, result: ncclResult_t) -> None:
    if result != 0:
        error_str = self.ncclGetErrorString(result)
        raise RuntimeError(f"NCCL error: {error_str}")

__init__

__init__(so_file: Optional[str] = None)
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def __init__(self, so_file: Optional[str] = None):
    so_file = so_file or find_nccl_library()

    try:
        if so_file not in NCCLLibrary.path_to_dict_mapping:
            lib = ctypes.CDLL(so_file)
            NCCLLibrary.path_to_library_cache[so_file] = lib
        self.lib = NCCLLibrary.path_to_library_cache[so_file]
    except Exception as e:
        logger.error(
            "Failed to load NCCL library from %s. "
            "It is expected if you are not running on NVIDIA/AMD GPUs."
            "Otherwise, the nccl library might not exist, be corrupted "
            "or it does not support the current platform %s. "
            "If you already have the library, please set the "
            "environment variable VLLM_NCCL_SO_PATH"
            " to point to the correct nccl library path.",
            so_file,
            platform.platform(),
        )
        raise e

    if so_file not in NCCLLibrary.path_to_dict_mapping:
        _funcs: dict[str, Any] = {}
        for func in NCCLLibrary.exported_functions:
            try:
                f = getattr(self.lib, func.name)
                f.restype = func.restype
                f.argtypes = func.argtypes
                _funcs[func.name] = f
            except AttributeError:
                if func.name in [
                    "ncclCommWindowRegister",
                    "ncclCommWindowDeregister",
                ]:
                    if envs.VLLM_USE_NCCL_SYMM_MEM:
                        logger.warning_once(
                            "The symbol %s is not found in the NCCL "
                            "library %s. To enable VLLM_USE_NCCL_SYMM_MEM "
                            " please update your NCCL version to >= "
                            "2.27.03.",
                            func.name,
                            so_file,
                        )
                    if current_platform.is_rocm():
                        # Having an exception here on ROCm platform is
                        # not allowed during graph capturing
                        continue
                raise
        NCCLLibrary.path_to_dict_mapping[so_file] = _funcs
    self._funcs = NCCLLibrary.path_to_dict_mapping[so_file]

ncclAllGather

ncclAllGather(
    sendbuff: buffer_type,
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclAllGather(
    self,
    sendbuff: buffer_type,
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None:
    # `datatype` actually should be `ncclDataType_t`
    # which is an aliases of `ctypes.c_int`
    # when we pass int to a function, it will be converted to `ctypes.c_int`
    # by ctypes automatically
    self.NCCL_CHECK(
        self._funcs["ncclAllGather"](
            sendbuff, recvbuff, count, datatype, comm, stream
        )
    )

ncclAllReduce

ncclAllReduce(
    sendbuff: buffer_type,
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    op: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclAllReduce(
    self,
    sendbuff: buffer_type,
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    op: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None:
    # `datatype` actually should be `ncclDataType_t`
    # and `op` should be `ncclRedOp_t`
    # both are aliases of `ctypes.c_int`
    # when we pass int to a function, it will be converted to `ctypes.c_int`
    # by ctypes automatically
    self.NCCL_CHECK(
        self._funcs["ncclAllReduce"](
            sendbuff, recvbuff, count, datatype, op, comm, stream
        )
    )

ncclBroadcast

ncclBroadcast(
    sendbuff: buffer_type,
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    root: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclBroadcast(
    self,
    sendbuff: buffer_type,
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    root: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None:
    self.NCCL_CHECK(
        self._funcs["ncclBroadcast"](
            sendbuff, recvbuff, count, datatype, root, comm, stream
        )
    )

ncclCommDestroy

ncclCommDestroy(comm: ncclComm_t) -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclCommDestroy(self, comm: ncclComm_t) -> None:
    self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm))

ncclCommInitRank

ncclCommInitRank(
    world_size: int, unique_id: ncclUniqueId, rank: int
) -> ncclComm_t
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclCommInitRank(
    self, world_size: int, unique_id: ncclUniqueId, rank: int
) -> ncclComm_t:
    comm = ncclComm_t()
    self.NCCL_CHECK(
        self._funcs["ncclCommInitRank"](
            ctypes.byref(comm), world_size, unique_id, rank
        )
    )
    return comm

ncclCommWindowDeregister

ncclCommWindowDeregister(
    comm: ncclComm_t, window: ncclWindow_t
) -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclCommWindowDeregister(self, comm: ncclComm_t, window: ncclWindow_t) -> None:
    self.NCCL_CHECK(self._funcs["ncclCommWindowDeregister"](comm, window))

ncclCommWindowRegister

ncclCommWindowRegister(
    comm: ncclComm_t,
    buff: buffer_type,
    size: int,
    win_flags: int,
) -> ncclWindow_t
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclCommWindowRegister(
    self, comm: ncclComm_t, buff: buffer_type, size: int, win_flags: int
) -> ncclWindow_t:
    window = ncclWindow_t()
    self.NCCL_CHECK(
        self._funcs["ncclCommWindowRegister"](
            comm, buff, size, ctypes.byref(window), win_flags
        )
    )
    return window

ncclGetErrorString

ncclGetErrorString(result: ncclResult_t) -> str
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclGetErrorString(self, result: ncclResult_t) -> str:
    return self._funcs["ncclGetErrorString"](result).decode("utf-8")

ncclGetRawVersion

ncclGetRawVersion() -> int
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclGetRawVersion(self) -> int:
    version = ctypes.c_int()
    self.NCCL_CHECK(self._funcs["ncclGetVersion"](ctypes.byref(version)))
    # something like 21903
    return version.value

ncclGetUniqueId

ncclGetUniqueId() -> ncclUniqueId
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclGetUniqueId(self) -> ncclUniqueId:
    unique_id = ncclUniqueId()
    self.NCCL_CHECK(self._funcs["ncclGetUniqueId"](ctypes.byref(unique_id)))
    return unique_id

ncclGetVersion

ncclGetVersion() -> str
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclGetVersion(self) -> str:
    version_str = str(self.ncclGetRawVersion())
    # something like 21903 --> "2.19.3"
    major = version_str[0].lstrip("0")
    minor = version_str[1:3].lstrip("0")
    patch = version_str[3:].lstrip("0")
    return f"{major}.{minor}.{patch}"

ncclGroupEnd

ncclGroupEnd() -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclGroupEnd(self) -> None:
    self.NCCL_CHECK(self._funcs["ncclGroupEnd"]())

ncclGroupStart

ncclGroupStart() -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclGroupStart(self) -> None:
    self.NCCL_CHECK(self._funcs["ncclGroupStart"]())

ncclRecv

ncclRecv(
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    src: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclRecv(
    self,
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    src: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None:
    self.NCCL_CHECK(
        self._funcs["ncclRecv"](recvbuff, count, datatype, src, comm, stream)
    )

ncclReduce

ncclReduce(
    sendbuff: buffer_type,
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    op: int,
    root: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclReduce(
    self,
    sendbuff: buffer_type,
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    op: int,
    root: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None:
    # `datatype` actually should be `ncclDataType_t`
    # and `op` should be `ncclRedOp_t`
    # both are aliases of `ctypes.c_int`
    # when we pass int to a function, it will be converted to `ctypes.c_int`
    # by ctypes automatically
    self.NCCL_CHECK(
        self._funcs["ncclReduce"](
            sendbuff, recvbuff, count, datatype, op, root, comm, stream
        )
    )

ncclReduceScatter

ncclReduceScatter(
    sendbuff: buffer_type,
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    op: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclReduceScatter(
    self,
    sendbuff: buffer_type,
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    op: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None:
    # `datatype` actually should be `ncclDataType_t`
    # and `op` should be `ncclRedOp_t`
    # both are aliases of `ctypes.c_int`
    # when we pass int to a function, it will be converted to `ctypes.c_int`
    # by ctypes automatically
    self.NCCL_CHECK(
        self._funcs["ncclReduceScatter"](
            sendbuff, recvbuff, count, datatype, op, comm, stream
        )
    )

ncclSend

ncclSend(
    sendbuff: buffer_type,
    count: int,
    datatype: int,
    dest: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclSend(
    self,
    sendbuff: buffer_type,
    count: int,
    datatype: int,
    dest: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None:
    self.NCCL_CHECK(
        self._funcs["ncclSend"](sendbuff, count, datatype, dest, comm, stream)
    )

unique_id_from_bytes

unique_id_from_bytes(data: bytes) -> ncclUniqueId
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def unique_id_from_bytes(self, data: bytes) -> ncclUniqueId:
    if len(data) != 128:
        raise ValueError(
            f"Expected 128 bytes for ncclUniqueId, got {len(data)} bytes"
        )
    unique_id = ncclUniqueId()
    ctypes.memmove(ctypes.addressof(unique_id.internal), data, 128)
    return unique_id

ncclDataTypeEnum

Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
class ncclDataTypeEnum:
    """Python mirror of NCCL's ncclDataType_t enum values."""

    ncclInt8 = 0
    ncclChar = 0
    ncclUint8 = 1
    ncclInt32 = 2
    ncclInt = 2
    ncclUint32 = 3
    ncclInt64 = 4
    ncclUint64 = 5
    ncclFloat16 = 6
    ncclHalf = 6
    ncclFloat32 = 7
    ncclFloat = 7
    ncclFloat64 = 8
    ncclDouble = 8
    ncclBfloat16 = 9
    ncclNumTypes = 10

    @classmethod
    def from_torch(cls, dtype: torch.dtype) -> int:
        """Map a torch dtype to the matching NCCL datatype constant.

        Raises:
            ValueError: for dtypes NCCL has no counterpart for.
        """
        table = (
            (torch.int8, cls.ncclInt8),
            (torch.uint8, cls.ncclUint8),
            (torch.int32, cls.ncclInt32),
            (torch.int64, cls.ncclInt64),
            (torch.float16, cls.ncclFloat16),
            (torch.float32, cls.ncclFloat32),
            (torch.float64, cls.ncclFloat64),
            (torch.bfloat16, cls.ncclBfloat16),
        )
        for torch_dtype, nccl_dtype in table:
            if dtype == torch_dtype:
                return nccl_dtype
        raise ValueError(f"Unsupported dtype: {dtype}")

ncclBfloat16 class-attribute instance-attribute

ncclBfloat16 = 9

ncclChar class-attribute instance-attribute

ncclChar = 0

ncclDouble class-attribute instance-attribute

ncclDouble = 8

ncclFloat class-attribute instance-attribute

ncclFloat = 7

ncclFloat16 class-attribute instance-attribute

ncclFloat16 = 6

ncclFloat32 class-attribute instance-attribute

ncclFloat32 = 7

ncclFloat64 class-attribute instance-attribute

ncclFloat64 = 8

ncclHalf class-attribute instance-attribute

ncclHalf = 6

ncclInt class-attribute instance-attribute

ncclInt = 2

ncclInt32 class-attribute instance-attribute

ncclInt32 = 2

ncclInt64 class-attribute instance-attribute

ncclInt64 = 4

ncclInt8 class-attribute instance-attribute

ncclInt8 = 0

ncclNumTypes class-attribute instance-attribute

ncclNumTypes = 10

ncclUint32 class-attribute instance-attribute

ncclUint32 = 3

ncclUint64 class-attribute instance-attribute

ncclUint64 = 5

ncclUint8 class-attribute instance-attribute

ncclUint8 = 1

from_torch classmethod

from_torch(dtype: dtype) -> int
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
@classmethod
def from_torch(cls, dtype: torch.dtype) -> int:
    """Translate a torch dtype into the corresponding NCCL datatype code.

    Raises:
        ValueError: for dtypes NCCL has no counterpart for.
    """
    for torch_dtype, nccl_dtype in (
        (torch.int8, cls.ncclInt8),
        (torch.uint8, cls.ncclUint8),
        (torch.int32, cls.ncclInt32),
        (torch.int64, cls.ncclInt64),
        (torch.float16, cls.ncclFloat16),
        (torch.float32, cls.ncclFloat32),
        (torch.float64, cls.ncclFloat64),
        (torch.bfloat16, cls.ncclBfloat16),
    ):
        if dtype == torch_dtype:
            return nccl_dtype
    raise ValueError(f"Unsupported dtype: {dtype}")

ncclRedOpTypeEnum

Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
class ncclRedOpTypeEnum:
    """Python mirror of NCCL's ncclRedOp_t enum values."""

    ncclSum = 0
    ncclProd = 1
    ncclMax = 2
    ncclMin = 3
    ncclAvg = 4
    ncclNumOps = 5

    @classmethod
    def from_torch(cls, op: ReduceOp) -> int:
        """Map a torch.distributed ReduceOp to the NCCL reduction-op code.

        Raises:
            ValueError: for reduction ops NCCL has no counterpart for.
        """
        # Compare with `==` (not a dict lookup) to keep ReduceOp's own
        # equality semantics.
        for torch_op, nccl_op in (
            (ReduceOp.SUM, cls.ncclSum),
            (ReduceOp.PRODUCT, cls.ncclProd),
            (ReduceOp.MAX, cls.ncclMax),
            (ReduceOp.MIN, cls.ncclMin),
            (ReduceOp.AVG, cls.ncclAvg),
        ):
            if op == torch_op:
                return nccl_op
        raise ValueError(f"Unsupported op: {op}")

ncclAvg class-attribute instance-attribute

ncclAvg = 4

ncclMax class-attribute instance-attribute

ncclMax = 2

ncclMin class-attribute instance-attribute

ncclMin = 3

ncclNumOps class-attribute instance-attribute

ncclNumOps = 5

ncclProd class-attribute instance-attribute

ncclProd = 1

ncclSum class-attribute instance-attribute

ncclSum = 0

from_torch classmethod

from_torch(op: ReduceOp) -> int
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
@classmethod
def from_torch(cls, op: ReduceOp) -> int:
    """Translate a torch.distributed ReduceOp into the NCCL reduction-op code.

    Raises:
        ValueError: for reduction ops NCCL has no counterpart for.
    """
    for torch_op, nccl_op in (
        (ReduceOp.SUM, cls.ncclSum),
        (ReduceOp.PRODUCT, cls.ncclProd),
        (ReduceOp.MAX, cls.ncclMax),
        (ReduceOp.MIN, cls.ncclMin),
        (ReduceOp.AVG, cls.ncclAvg),
    ):
        if op == torch_op:
            return nccl_op
    raise ValueError(f"Unsupported op: {op}")

ncclUniqueId

Bases: Structure

Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
class ncclUniqueId(ctypes.Structure):
    """ctypes mirror of NCCL's ncclUniqueId: an opaque 128-byte blob."""

    _fields_ = [("internal", ctypes.c_byte * 128)]

_fields_ class-attribute instance-attribute

_fields_ = [('internal', c_byte * 128)]