Skip to content

vllm.distributed.device_communicators.pynccl_wrapper

__all__ module-attribute

__all__ = [
    "NCCLLibrary",
    "ncclDataTypeEnum",
    "ncclRedOpTypeEnum",
    "ncclUniqueId",
    "ncclComm_t",
    "cudaStream_t",
    "buffer_type",
]

buffer_type module-attribute

buffer_type = c_void_p

cudaStream_t module-attribute

cudaStream_t = c_void_p

logger module-attribute

logger = init_logger(__name__)

ncclComm_t module-attribute

ncclComm_t = c_void_p

ncclDataType_t module-attribute

ncclDataType_t = c_int

ncclRedOp_t module-attribute

ncclRedOp_t = c_int

ncclResult_t module-attribute

ncclResult_t = c_int

ncclWindow_t module-attribute

ncclWindow_t = c_void_p

Function dataclass

Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
@dataclass
class Function:
    """Declaration of one C function exported by the NCCL shared library.

    Each entry describes the ctypes signature (return type and argument
    types) that is applied to the symbol after it is looked up via
    ``getattr`` on the loaded library.
    """

    # Symbol name in the shared library, e.g. "ncclAllReduce".
    name: str
    # ctypes return type (e.g. ncclResult_t, ctypes.c_char_p).
    restype: Any
    # ctypes argument types, in call order.
    argtypes: list[Any]

argtypes instance-attribute

argtypes: list[Any]

name instance-attribute

name: str

restype instance-attribute

restype: Any

__init__

__init__(
    name: str, restype: Any, argtypes: list[Any]
) -> None

NCCLLibrary

Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
class NCCLLibrary:
    """Thin ctypes wrapper around the NCCL shared library.

    The loaded library handle and the resolved function pointers are
    cached at class level, keyed by the library path, so constructing
    multiple instances for the same ``.so`` file does not reload the
    library or re-resolve symbols.
    """

    exported_functions = [
        # const char* ncclGetErrorString(ncclResult_t result)
        Function("ncclGetErrorString", ctypes.c_char_p, [ncclResult_t]),
        # ncclResult_t  ncclGetVersion(int *version);
        Function("ncclGetVersion", ncclResult_t, [ctypes.POINTER(ctypes.c_int)]),
        # ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
        Function("ncclGetUniqueId", ncclResult_t, [ctypes.POINTER(ncclUniqueId)]),
        # ncclResult_t  ncclCommInitRank(
        #   ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
        # note that ncclComm_t is a pointer type, so the first argument
        # is a pointer to a pointer
        Function(
            "ncclCommInitRank",
            ncclResult_t,
            [ctypes.POINTER(ncclComm_t), ctypes.c_int, ncclUniqueId, ctypes.c_int],
        ),
        # ncclResult_t  ncclAllReduce(
        #   const void* sendbuff, void* recvbuff, size_t count,
        #   ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
        #   cudaStream_t stream);
        # note that cudaStream_t is a pointer type, so the last argument
        # is a pointer
        Function(
            "ncclAllReduce",
            ncclResult_t,
            [
                buffer_type,
                buffer_type,
                ctypes.c_size_t,
                ncclDataType_t,
                ncclRedOp_t,
                ncclComm_t,
                cudaStream_t,
            ],
        ),
        # ncclResult_t  ncclReduce(
        #   const void* sendbuff, void* recvbuff, size_t count,
        #   ncclDataType_t datatype, ncclRedOp_t op, int root,
        #   ncclComm_t comm,  cudaStream_t stream);
        # note that cudaStream_t is a pointer type, so the last argument
        # is a pointer
        Function(
            "ncclReduce",
            ncclResult_t,
            [
                buffer_type,
                buffer_type,
                ctypes.c_size_t,
                ncclDataType_t,
                ncclRedOp_t,
                ctypes.c_int,
                ncclComm_t,
                cudaStream_t,
            ],
        ),
        # ncclResult_t  ncclAllGather(
        #   const void* sendbuff, void* recvbuff, size_t count,
        #   ncclDataType_t datatype, ncclComm_t comm,
        #   cudaStream_t stream);
        # note that cudaStream_t is a pointer type, so the last argument
        # is a pointer
        Function(
            "ncclAllGather",
            ncclResult_t,
            [
                buffer_type,
                buffer_type,
                ctypes.c_size_t,
                ncclDataType_t,
                ncclComm_t,
                cudaStream_t,
            ],
        ),
        # ncclResult_t  ncclReduceScatter(
        #   const void* sendbuff, void* recvbuff, size_t count,
        #   ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
        #   cudaStream_t stream);
        # note that cudaStream_t is a pointer type, so the last argument
        # is a pointer
        Function(
            "ncclReduceScatter",
            ncclResult_t,
            [
                buffer_type,
                buffer_type,
                ctypes.c_size_t,
                ncclDataType_t,
                ncclRedOp_t,
                ncclComm_t,
                cudaStream_t,
            ],
        ),
        # ncclResult_t  ncclSend(
        #   const void* sendbuff, size_t count, ncclDataType_t datatype,
        #   int dest, ncclComm_t comm, cudaStream_t stream);
        Function(
            "ncclSend",
            ncclResult_t,
            [
                buffer_type,
                ctypes.c_size_t,
                ncclDataType_t,
                ctypes.c_int,
                ncclComm_t,
                cudaStream_t,
            ],
        ),
        # ncclResult_t  ncclRecv(
        #   void* recvbuff, size_t count, ncclDataType_t datatype,
        #   int src, ncclComm_t comm, cudaStream_t stream);
        Function(
            "ncclRecv",
            ncclResult_t,
            [
                buffer_type,
                ctypes.c_size_t,
                ncclDataType_t,
                ctypes.c_int,
                ncclComm_t,
                cudaStream_t,
            ],
        ),
        # ncclResult_t ncclBroadcast(
        #   const void* sendbuff, void* recvbuff, size_t count,
        #   ncclDataType_t datatype, int root, ncclComm_t comm,
        #   cudaStream_t stream);
        Function(
            "ncclBroadcast",
            ncclResult_t,
            [
                buffer_type,
                buffer_type,
                ctypes.c_size_t,
                ncclDataType_t,
                ctypes.c_int,
                ncclComm_t,
                cudaStream_t,
            ],
        ),
        # be cautious! this is a collective call, it will block until all
        # processes in the communicator have called this function.
        # because Python object destruction can happen in random order,
        # it is better not to call it at all.
        # ncclResult_t  ncclCommDestroy(ncclComm_t comm);
        Function("ncclCommDestroy", ncclResult_t, [ncclComm_t]),
        # ncclResult_t ncclGroupStart();
        Function("ncclGroupStart", ncclResult_t, []),
        # ncclResult_t ncclGroupEnd();
        Function("ncclGroupEnd", ncclResult_t, []),
        # ncclResult_t ncclCommWindowRegister(
        #   ncclComm_t comm, void* buff, size_t size,
        #   ncclWindow_t* win, int winFlags);
        Function(
            "ncclCommWindowRegister",
            ncclResult_t,
            [
                ncclComm_t,
                buffer_type,
                ctypes.c_size_t,
                ctypes.POINTER(ncclWindow_t),
                ctypes.c_int,
            ],
        ),
        # ncclResult_t ncclCommWindowDeregister(
        #   ncclComm_t comm, ncclWindow_t win);
        Function("ncclCommWindowDeregister", ncclResult_t, [ncclComm_t, ncclWindow_t]),
    ]

    # class attribute to store the mapping from the path to the library
    # to avoid loading the same library multiple times
    path_to_library_cache: dict[str, Any] = {}

    # class attribute to store the mapping from library path
    #  to the corresponding dictionary of resolved functions
    path_to_dict_mapping: dict[str, dict[str, Any]] = {}

    def __init__(self, so_file: Optional[str] = None):
        """Load (or reuse) the NCCL library at `so_file`.

        Args:
            so_file: path to the NCCL shared library. If None, the path
                is discovered via `find_nccl_library()`.

        Raises:
            OSError: if the library cannot be loaded.
            AttributeError: if a required symbol is missing (except the
                optional window-registration symbols, see below).
        """
        so_file = so_file or find_nccl_library()

        try:
            # Guard the dlopen on the library cache itself (not the
            # function dict) so each cache is keyed consistently.
            if so_file not in NCCLLibrary.path_to_library_cache:
                lib = ctypes.CDLL(so_file)
                NCCLLibrary.path_to_library_cache[so_file] = lib
            self.lib = NCCLLibrary.path_to_library_cache[so_file]
        except Exception:
            logger.error(
                "Failed to load NCCL library from %s. "
                "It is expected if you are not running on NVIDIA/AMD GPUs. "
                "Otherwise, the nccl library might not exist, be corrupted "
                "or it does not support the current platform %s. "
                "If you already have the library, please set the "
                "environment variable VLLM_NCCL_SO_PATH"
                " to point to the correct nccl library path.",
                so_file,
                platform.platform(),
            )
            # Re-raise as-is to preserve the original traceback.
            raise

        if so_file not in NCCLLibrary.path_to_dict_mapping:
            # Resolve every exported symbol once and apply its ctypes
            # signature; cache the result per library path.
            _funcs: dict[str, Any] = {}
            for func in NCCLLibrary.exported_functions:
                try:
                    f = getattr(self.lib, func.name)
                    f.restype = func.restype
                    f.argtypes = func.argtypes
                    _funcs[func.name] = f
                except AttributeError:
                    # The window registration API only exists in
                    # NCCL >= 2.27.03; treat it as optional.
                    if func.name in [
                        "ncclCommWindowRegister",
                        "ncclCommWindowDeregister",
                    ]:
                        if envs.VLLM_USE_NCCL_SYMM_MEM:
                            logger.warning_once(
                                "The symbol %s is not found in the NCCL "
                                "library %s. To enable VLLM_USE_NCCL_SYMM_MEM "
                                " please update your NCCL version to >= "
                                "2.27.03.",
                                func.name,
                                so_file,
                            )
                        if current_platform.is_rocm():
                            # Having an exception here on ROCm platform is
                            # not allowed during graph capturing
                            continue
                    raise
            NCCLLibrary.path_to_dict_mapping[so_file] = _funcs
        self._funcs = NCCLLibrary.path_to_dict_mapping[so_file]

    def ncclGetErrorString(self, result: ncclResult_t) -> str:
        """Return the human-readable message for an NCCL result code."""
        return self._funcs["ncclGetErrorString"](result).decode("utf-8")

    def NCCL_CHECK(self, result: ncclResult_t) -> None:
        """Raise RuntimeError if `result` is not ncclSuccess (0)."""
        if result != 0:
            error_str = self.ncclGetErrorString(result)
            raise RuntimeError(f"NCCL error: {error_str}")

    def ncclGetRawVersion(self) -> int:
        """Return the raw integer NCCL version code, e.g. 21903."""
        version = ctypes.c_int()
        self.NCCL_CHECK(self._funcs["ncclGetVersion"](ctypes.byref(version)))
        return version.value

    def ncclGetVersion(self) -> str:
        """Return the NCCL version as a dotted string, e.g. "2.19.3".

        NCCL (>= 2.9) encodes its version as
        ``major * 10000 + minor * 100 + patch``; decode it
        arithmetically. The previous string-slicing approach dropped a
        zero patch component (21900 -> "2.19." instead of "2.19.0").
        """
        version = self.ncclGetRawVersion()
        major, rest = divmod(version, 10000)
        minor, patch = divmod(rest, 100)
        return f"{major}.{minor}.{patch}"

    def ncclGetUniqueId(self) -> ncclUniqueId:
        """Ask NCCL for a fresh unique id used to bootstrap a communicator."""
        unique_id = ncclUniqueId()
        self.NCCL_CHECK(self._funcs["ncclGetUniqueId"](ctypes.byref(unique_id)))
        return unique_id

    def unique_id_from_bytes(self, data: bytes) -> ncclUniqueId:
        """Reconstruct an ncclUniqueId from its 128-byte serialized form.

        128 bytes matches NCCL's NCCL_UNIQUE_ID_BYTES.

        Raises:
            ValueError: if `data` is not exactly 128 bytes.
        """
        if len(data) != 128:
            raise ValueError(
                f"Expected 128 bytes for ncclUniqueId, got {len(data)} bytes"
            )
        unique_id = ncclUniqueId()
        ctypes.memmove(ctypes.addressof(unique_id.internal), data, 128)
        return unique_id

    def ncclCommInitRank(
        self, world_size: int, unique_id: ncclUniqueId, rank: int
    ) -> ncclComm_t:
        """Create a communicator for `rank` within a `world_size` group."""
        comm = ncclComm_t()
        self.NCCL_CHECK(
            self._funcs["ncclCommInitRank"](
                ctypes.byref(comm), world_size, unique_id, rank
            )
        )
        return comm

    def ncclAllReduce(
        self,
        sendbuff: buffer_type,
        recvbuff: buffer_type,
        count: int,
        datatype: int,
        op: int,
        comm: ncclComm_t,
        stream: cudaStream_t,
    ) -> None:
        """All-reduce `count` elements of `datatype` with reduction `op`."""
        # `datatype` actually should be `ncclDataType_t`
        # and `op` should be `ncclRedOp_t`
        # both are aliases of `ctypes.c_int`
        # when we pass int to a function, it will be converted to `ctypes.c_int`
        # by ctypes automatically
        self.NCCL_CHECK(
            self._funcs["ncclAllReduce"](
                sendbuff, recvbuff, count, datatype, op, comm, stream
            )
        )

    def ncclReduce(
        self,
        sendbuff: buffer_type,
        recvbuff: buffer_type,
        count: int,
        datatype: int,
        op: int,
        root: int,
        comm: ncclComm_t,
        stream: cudaStream_t,
    ) -> None:
        """Reduce `count` elements with `op`; the result lands on `root`."""
        # `datatype` actually should be `ncclDataType_t`
        # and `op` should be `ncclRedOp_t`
        # both are aliases of `ctypes.c_int`
        # when we pass int to a function, it will be converted to `ctypes.c_int`
        # by ctypes automatically
        self.NCCL_CHECK(
            self._funcs["ncclReduce"](
                sendbuff, recvbuff, count, datatype, op, root, comm, stream
            )
        )

    def ncclReduceScatter(
        self,
        sendbuff: buffer_type,
        recvbuff: buffer_type,
        count: int,
        datatype: int,
        op: int,
        comm: ncclComm_t,
        stream: cudaStream_t,
    ) -> None:
        """Reduce-scatter: each rank receives `count` reduced elements."""
        # `datatype` actually should be `ncclDataType_t`
        # and `op` should be `ncclRedOp_t`
        # both are aliases of `ctypes.c_int`
        # when we pass int to a function, it will be converted to `ctypes.c_int`
        # by ctypes automatically
        self.NCCL_CHECK(
            self._funcs["ncclReduceScatter"](
                sendbuff, recvbuff, count, datatype, op, comm, stream
            )
        )

    def ncclAllGather(
        self,
        sendbuff: buffer_type,
        recvbuff: buffer_type,
        count: int,
        datatype: int,
        comm: ncclComm_t,
        stream: cudaStream_t,
    ) -> None:
        """All-gather `count` elements from each rank into `recvbuff`."""
        # `datatype` actually should be `ncclDataType_t`
        # which is an alias of `ctypes.c_int`
        # when we pass int to a function, it will be converted to `ctypes.c_int`
        # by ctypes automatically
        self.NCCL_CHECK(
            self._funcs["ncclAllGather"](
                sendbuff, recvbuff, count, datatype, comm, stream
            )
        )

    def ncclSend(
        self,
        sendbuff: buffer_type,
        count: int,
        datatype: int,
        dest: int,
        comm: ncclComm_t,
        stream: cudaStream_t,
    ) -> None:
        """Point-to-point send of `count` elements to rank `dest`."""
        self.NCCL_CHECK(
            self._funcs["ncclSend"](sendbuff, count, datatype, dest, comm, stream)
        )

    def ncclRecv(
        self,
        recvbuff: buffer_type,
        count: int,
        datatype: int,
        src: int,
        comm: ncclComm_t,
        stream: cudaStream_t,
    ) -> None:
        """Point-to-point receive of `count` elements from rank `src`."""
        self.NCCL_CHECK(
            self._funcs["ncclRecv"](recvbuff, count, datatype, src, comm, stream)
        )

    def ncclBroadcast(
        self,
        sendbuff: buffer_type,
        recvbuff: buffer_type,
        count: int,
        datatype: int,
        root: int,
        comm: ncclComm_t,
        stream: cudaStream_t,
    ) -> None:
        """Broadcast `count` elements from rank `root` to all ranks."""
        self.NCCL_CHECK(
            self._funcs["ncclBroadcast"](
                sendbuff, recvbuff, count, datatype, root, comm, stream
            )
        )

    def ncclCommDestroy(self, comm: ncclComm_t) -> None:
        """Destroy a communicator. Collective: all ranks must call it."""
        self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm))

    def ncclGroupStart(self) -> None:
        """Begin a group of NCCL calls to be fused/issued together."""
        self.NCCL_CHECK(self._funcs["ncclGroupStart"]())

    def ncclGroupEnd(self) -> None:
        """End a group started with ncclGroupStart."""
        self.NCCL_CHECK(self._funcs["ncclGroupEnd"]())

    def ncclCommWindowRegister(
        self, comm: ncclComm_t, buff: buffer_type, size: int, win_flags: int
    ) -> ncclWindow_t:
        """Register `buff` as a symmetric-memory window (NCCL >= 2.27.03)."""
        window = ncclWindow_t()
        self.NCCL_CHECK(
            self._funcs["ncclCommWindowRegister"](
                comm, buff, size, ctypes.byref(window), win_flags
            )
        )
        return window

    def ncclCommWindowDeregister(self, comm: ncclComm_t, window: ncclWindow_t) -> None:
        """Deregister a window created by ncclCommWindowRegister."""
        self.NCCL_CHECK(self._funcs["ncclCommWindowDeregister"](comm, window))

_funcs instance-attribute

_funcs = path_to_dict_mapping[so_file]

exported_functions class-attribute instance-attribute

exported_functions = [
    Function(
        "ncclGetErrorString", c_char_p, [ncclResult_t]
    ),
    Function(
        "ncclGetVersion", ncclResult_t, [POINTER(c_int)]
    ),
    Function(
        "ncclGetUniqueId",
        ncclResult_t,
        [POINTER(ncclUniqueId)],
    ),
    Function(
        "ncclCommInitRank",
        ncclResult_t,
        [POINTER(ncclComm_t), c_int, ncclUniqueId, c_int],
    ),
    Function(
        "ncclAllReduce",
        ncclResult_t,
        [
            buffer_type,
            buffer_type,
            c_size_t,
            ncclDataType_t,
            ncclRedOp_t,
            ncclComm_t,
            cudaStream_t,
        ],
    ),
    Function(
        "ncclReduce",
        ncclResult_t,
        [
            buffer_type,
            buffer_type,
            c_size_t,
            ncclDataType_t,
            ncclRedOp_t,
            c_int,
            ncclComm_t,
            cudaStream_t,
        ],
    ),
    Function(
        "ncclAllGather",
        ncclResult_t,
        [
            buffer_type,
            buffer_type,
            c_size_t,
            ncclDataType_t,
            ncclComm_t,
            cudaStream_t,
        ],
    ),
    Function(
        "ncclReduceScatter",
        ncclResult_t,
        [
            buffer_type,
            buffer_type,
            c_size_t,
            ncclDataType_t,
            ncclRedOp_t,
            ncclComm_t,
            cudaStream_t,
        ],
    ),
    Function(
        "ncclSend",
        ncclResult_t,
        [
            buffer_type,
            c_size_t,
            ncclDataType_t,
            c_int,
            ncclComm_t,
            cudaStream_t,
        ],
    ),
    Function(
        "ncclRecv",
        ncclResult_t,
        [
            buffer_type,
            c_size_t,
            ncclDataType_t,
            c_int,
            ncclComm_t,
            cudaStream_t,
        ],
    ),
    Function(
        "ncclBroadcast",
        ncclResult_t,
        [
            buffer_type,
            buffer_type,
            c_size_t,
            ncclDataType_t,
            c_int,
            ncclComm_t,
            cudaStream_t,
        ],
    ),
    Function("ncclCommDestroy", ncclResult_t, [ncclComm_t]),
    Function("ncclGroupStart", ncclResult_t, []),
    Function("ncclGroupEnd", ncclResult_t, []),
    Function(
        "ncclCommWindowRegister",
        ncclResult_t,
        [
            ncclComm_t,
            buffer_type,
            c_size_t,
            POINTER(ncclWindow_t),
            c_int,
        ],
    ),
    Function(
        "ncclCommWindowDeregister",
        ncclResult_t,
        [ncclComm_t, ncclWindow_t],
    ),
]

lib instance-attribute

lib = path_to_library_cache[so_file]

path_to_dict_mapping class-attribute instance-attribute

path_to_dict_mapping: dict[str, dict[str, Any]] = {}

path_to_library_cache class-attribute instance-attribute

path_to_library_cache: dict[str, Any] = {}

NCCL_CHECK

NCCL_CHECK(result: ncclResult_t) -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def NCCL_CHECK(self, result: ncclResult_t) -> None:
    if result != 0:
        error_str = self.ncclGetErrorString(result)
        raise RuntimeError(f"NCCL error: {error_str}")

__init__

__init__(so_file: Optional[str] = None)
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def __init__(self, so_file: Optional[str] = None):
    so_file = so_file or find_nccl_library()

    try:
        if so_file not in NCCLLibrary.path_to_dict_mapping:
            lib = ctypes.CDLL(so_file)
            NCCLLibrary.path_to_library_cache[so_file] = lib
        self.lib = NCCLLibrary.path_to_library_cache[so_file]
    except Exception as e:
        logger.error(
            "Failed to load NCCL library from %s. "
            "It is expected if you are not running on NVIDIA/AMD GPUs."
            "Otherwise, the nccl library might not exist, be corrupted "
            "or it does not support the current platform %s. "
            "If you already have the library, please set the "
            "environment variable VLLM_NCCL_SO_PATH"
            " to point to the correct nccl library path.",
            so_file,
            platform.platform(),
        )
        raise e

    if so_file not in NCCLLibrary.path_to_dict_mapping:
        _funcs: dict[str, Any] = {}
        for func in NCCLLibrary.exported_functions:
            try:
                f = getattr(self.lib, func.name)
                f.restype = func.restype
                f.argtypes = func.argtypes
                _funcs[func.name] = f
            except AttributeError:
                if func.name in [
                    "ncclCommWindowRegister",
                    "ncclCommWindowDeregister",
                ]:
                    if envs.VLLM_USE_NCCL_SYMM_MEM:
                        logger.warning_once(
                            "The symbol %s is not found in the NCCL "
                            "library %s. To enable VLLM_USE_NCCL_SYMM_MEM "
                            " please update your NCCL version to >= "
                            "2.27.03.",
                            func.name,
                            so_file,
                        )
                    if current_platform.is_rocm():
                        # Having an exception here on ROCm platform is
                        # not allowed during graph capturing
                        continue
                raise
        NCCLLibrary.path_to_dict_mapping[so_file] = _funcs
    self._funcs = NCCLLibrary.path_to_dict_mapping[so_file]

ncclAllGather

ncclAllGather(
    sendbuff: buffer_type,
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclAllGather(
    self,
    sendbuff: buffer_type,
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None:
    # `datatype` actually should be `ncclDataType_t`
    # which is an aliases of `ctypes.c_int`
    # when we pass int to a function, it will be converted to `ctypes.c_int`
    # by ctypes automatically
    self.NCCL_CHECK(
        self._funcs["ncclAllGather"](
            sendbuff, recvbuff, count, datatype, comm, stream
        )
    )

ncclAllReduce

ncclAllReduce(
    sendbuff: buffer_type,
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    op: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclAllReduce(
    self,
    sendbuff: buffer_type,
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    op: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None:
    # `datatype` actually should be `ncclDataType_t`
    # and `op` should be `ncclRedOp_t`
    # both are aliases of `ctypes.c_int`
    # when we pass int to a function, it will be converted to `ctypes.c_int`
    # by ctypes automatically
    self.NCCL_CHECK(
        self._funcs["ncclAllReduce"](
            sendbuff, recvbuff, count, datatype, op, comm, stream
        )
    )

ncclBroadcast

ncclBroadcast(
    sendbuff: buffer_type,
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    root: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclBroadcast(
    self,
    sendbuff: buffer_type,
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    root: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None:
    self.NCCL_CHECK(
        self._funcs["ncclBroadcast"](
            sendbuff, recvbuff, count, datatype, root, comm, stream
        )
    )

ncclCommDestroy

ncclCommDestroy(comm: ncclComm_t) -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclCommDestroy(self, comm: ncclComm_t) -> None:
    self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm))

ncclCommInitRank

ncclCommInitRank(
    world_size: int, unique_id: ncclUniqueId, rank: int
) -> ncclComm_t
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclCommInitRank(
    self, world_size: int, unique_id: ncclUniqueId, rank: int
) -> ncclComm_t:
    comm = ncclComm_t()
    self.NCCL_CHECK(
        self._funcs["ncclCommInitRank"](
            ctypes.byref(comm), world_size, unique_id, rank
        )
    )
    return comm

ncclCommWindowDeregister

ncclCommWindowDeregister(
    comm: ncclComm_t, window: ncclWindow_t
) -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclCommWindowDeregister(self, comm: ncclComm_t, window: ncclWindow_t) -> None:
    self.NCCL_CHECK(self._funcs["ncclCommWindowDeregister"](comm, window))

ncclCommWindowRegister

ncclCommWindowRegister(
    comm: ncclComm_t,
    buff: buffer_type,
    size: int,
    win_flags: int,
) -> ncclWindow_t
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclCommWindowRegister(
    self, comm: ncclComm_t, buff: buffer_type, size: int, win_flags: int
) -> ncclWindow_t:
    window = ncclWindow_t()
    self.NCCL_CHECK(
        self._funcs["ncclCommWindowRegister"](
            comm, buff, size, ctypes.byref(window), win_flags
        )
    )
    return window

ncclGetErrorString

ncclGetErrorString(result: ncclResult_t) -> str
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclGetErrorString(self, result: ncclResult_t) -> str:
    return self._funcs["ncclGetErrorString"](result).decode("utf-8")

ncclGetRawVersion

ncclGetRawVersion() -> int
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclGetRawVersion(self) -> int:
    version = ctypes.c_int()
    self.NCCL_CHECK(self._funcs["ncclGetVersion"](ctypes.byref(version)))
    # something like 21903
    return version.value

ncclGetUniqueId

ncclGetUniqueId() -> ncclUniqueId
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclGetUniqueId(self) -> ncclUniqueId:
    unique_id = ncclUniqueId()
    self.NCCL_CHECK(self._funcs["ncclGetUniqueId"](ctypes.byref(unique_id)))
    return unique_id

ncclGetVersion

ncclGetVersion() -> str
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclGetVersion(self) -> str:
    version_str = str(self.ncclGetRawVersion())
    # something like 21903 --> "2.19.3"
    major = version_str[0].lstrip("0")
    minor = version_str[1:3].lstrip("0")
    patch = version_str[3:].lstrip("0")
    return f"{major}.{minor}.{patch}"

ncclGroupEnd

ncclGroupEnd() -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclGroupEnd(self) -> None:
    self.NCCL_CHECK(self._funcs["ncclGroupEnd"]())

ncclGroupStart

ncclGroupStart() -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclGroupStart(self) -> None:
    self.NCCL_CHECK(self._funcs["ncclGroupStart"]())

ncclRecv

ncclRecv(
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    src: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclRecv(
    self,
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    src: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None:
    self.NCCL_CHECK(
        self._funcs["ncclRecv"](recvbuff, count, datatype, src, comm, stream)
    )

ncclReduce

ncclReduce(
    sendbuff: buffer_type,
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    op: int,
    root: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclReduce(
    self,
    sendbuff: buffer_type,
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    op: int,
    root: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None:
    # `datatype` actually should be `ncclDataType_t`
    # and `op` should be `ncclRedOp_t`
    # both are aliases of `ctypes.c_int`
    # when we pass int to a function, it will be converted to `ctypes.c_int`
    # by ctypes automatically
    self.NCCL_CHECK(
        self._funcs["ncclReduce"](
            sendbuff, recvbuff, count, datatype, op, root, comm, stream
        )
    )

ncclReduceScatter

ncclReduceScatter(
    sendbuff: buffer_type,
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    op: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclReduceScatter(
    self,
    sendbuff: buffer_type,
    recvbuff: buffer_type,
    count: int,
    datatype: int,
    op: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None:
    # `datatype` actually should be `ncclDataType_t`
    # and `op` should be `ncclRedOp_t`
    # both are aliases of `ctypes.c_int`
    # when we pass int to a function, it will be converted to `ctypes.c_int`
    # by ctypes automatically
    self.NCCL_CHECK(
        self._funcs["ncclReduceScatter"](
            sendbuff, recvbuff, count, datatype, op, comm, stream
        )
    )

ncclSend

ncclSend(
    sendbuff: buffer_type,
    count: int,
    datatype: int,
    dest: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def ncclSend(
    self,
    sendbuff: buffer_type,
    count: int,
    datatype: int,
    dest: int,
    comm: ncclComm_t,
    stream: cudaStream_t,
) -> None:
    self.NCCL_CHECK(
        self._funcs["ncclSend"](sendbuff, count, datatype, dest, comm, stream)
    )

unique_id_from_bytes

unique_id_from_bytes(data: bytes) -> ncclUniqueId
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
def unique_id_from_bytes(self, data: bytes) -> ncclUniqueId:
    if len(data) != 128:
        raise ValueError(
            f"Expected 128 bytes for ncclUniqueId, got {len(data)} bytes"
        )
    unique_id = ncclUniqueId()
    ctypes.memmove(ctypes.addressof(unique_id.internal), data, 128)
    return unique_id

ncclDataTypeEnum

Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
class ncclDataTypeEnum:
    """Python mirror of NCCL's ncclDataType_t enum values."""

    ncclInt8 = 0
    ncclChar = 0
    ncclUint8 = 1
    ncclInt32 = 2
    ncclInt = 2
    ncclUint32 = 3
    ncclInt64 = 4
    ncclUint64 = 5
    ncclFloat16 = 6
    ncclHalf = 6
    ncclFloat32 = 7
    ncclFloat = 7
    ncclFloat64 = 8
    ncclDouble = 8
    ncclBfloat16 = 9
    ncclNumTypes = 10

    @classmethod
    def from_torch(cls, dtype: torch.dtype) -> int:
        """Map a torch dtype to the matching NCCL datatype constant.

        Raises:
            ValueError: for dtypes NCCL has no counterpart for.
        """
        table = (
            (torch.int8, cls.ncclInt8),
            (torch.uint8, cls.ncclUint8),
            (torch.int32, cls.ncclInt32),
            (torch.int64, cls.ncclInt64),
            (torch.float16, cls.ncclFloat16),
            (torch.float32, cls.ncclFloat32),
            (torch.float64, cls.ncclFloat64),
            (torch.bfloat16, cls.ncclBfloat16),
        )
        for torch_dtype, nccl_dtype in table:
            if dtype == torch_dtype:
                return nccl_dtype
        raise ValueError(f"Unsupported dtype: {dtype}")

ncclBfloat16 class-attribute instance-attribute

ncclBfloat16 = 9

ncclChar class-attribute instance-attribute

ncclChar = 0

ncclDouble class-attribute instance-attribute

ncclDouble = 8

ncclFloat class-attribute instance-attribute

ncclFloat = 7

ncclFloat16 class-attribute instance-attribute

ncclFloat16 = 6

ncclFloat32 class-attribute instance-attribute

ncclFloat32 = 7

ncclFloat64 class-attribute instance-attribute

ncclFloat64 = 8

ncclHalf class-attribute instance-attribute

ncclHalf = 6

ncclInt class-attribute instance-attribute

ncclInt = 2

ncclInt32 class-attribute instance-attribute

ncclInt32 = 2

ncclInt64 class-attribute instance-attribute

ncclInt64 = 4

ncclInt8 class-attribute instance-attribute

ncclInt8 = 0

ncclNumTypes class-attribute instance-attribute

ncclNumTypes = 10

ncclUint32 class-attribute instance-attribute

ncclUint32 = 3

ncclUint64 class-attribute instance-attribute

ncclUint64 = 5

ncclUint8 class-attribute instance-attribute

ncclUint8 = 1

from_torch classmethod

from_torch(dtype: dtype) -> int
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
@classmethod
def from_torch(cls, dtype: torch.dtype) -> int:
    """Translate a torch dtype into the corresponding NCCL datatype code.

    Raises:
        ValueError: for dtypes NCCL has no counterpart for.
    """
    for torch_dtype, nccl_dtype in (
        (torch.int8, cls.ncclInt8),
        (torch.uint8, cls.ncclUint8),
        (torch.int32, cls.ncclInt32),
        (torch.int64, cls.ncclInt64),
        (torch.float16, cls.ncclFloat16),
        (torch.float32, cls.ncclFloat32),
        (torch.float64, cls.ncclFloat64),
        (torch.bfloat16, cls.ncclBfloat16),
    ):
        if dtype == torch_dtype:
            return nccl_dtype
    raise ValueError(f"Unsupported dtype: {dtype}")

ncclRedOpTypeEnum

Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
class ncclRedOpTypeEnum:
    """Python mirror of NCCL's ncclRedOp_t enum values."""

    ncclSum = 0
    ncclProd = 1
    ncclMax = 2
    ncclMin = 3
    ncclAvg = 4
    ncclNumOps = 5

    @classmethod
    def from_torch(cls, op: ReduceOp) -> int:
        """Map a torch.distributed ReduceOp to the NCCL reduction-op code.

        Raises:
            ValueError: for reduction ops NCCL has no counterpart for.
        """
        # Compare with `==` (not a dict lookup) to keep ReduceOp's own
        # equality semantics.
        for torch_op, nccl_op in (
            (ReduceOp.SUM, cls.ncclSum),
            (ReduceOp.PRODUCT, cls.ncclProd),
            (ReduceOp.MAX, cls.ncclMax),
            (ReduceOp.MIN, cls.ncclMin),
            (ReduceOp.AVG, cls.ncclAvg),
        ):
            if op == torch_op:
                return nccl_op
        raise ValueError(f"Unsupported op: {op}")

ncclAvg class-attribute instance-attribute

ncclAvg = 4

ncclMax class-attribute instance-attribute

ncclMax = 2

ncclMin class-attribute instance-attribute

ncclMin = 3

ncclNumOps class-attribute instance-attribute

ncclNumOps = 5

ncclProd class-attribute instance-attribute

ncclProd = 1

ncclSum class-attribute instance-attribute

ncclSum = 0

from_torch classmethod

from_torch(op: ReduceOp) -> int
Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
@classmethod
def from_torch(cls, op: ReduceOp) -> int:
    """Translate a torch.distributed ReduceOp into the NCCL reduction-op code.

    Raises:
        ValueError: for reduction ops NCCL has no counterpart for.
    """
    for torch_op, nccl_op in (
        (ReduceOp.SUM, cls.ncclSum),
        (ReduceOp.PRODUCT, cls.ncclProd),
        (ReduceOp.MAX, cls.ncclMax),
        (ReduceOp.MIN, cls.ncclMin),
        (ReduceOp.AVG, cls.ncclAvg),
    ):
        if op == torch_op:
            return nccl_op
    raise ValueError(f"Unsupported op: {op}")

ncclUniqueId

Bases: Structure

Source code in vllm/distributed/device_communicators/pynccl_wrapper.py
class ncclUniqueId(ctypes.Structure):
    """ctypes mirror of NCCL's ncclUniqueId: an opaque 128-byte blob."""

    _fields_ = [("internal", ctypes.c_byte * 128)]

_fields_ class-attribute instance-attribute

_fields_ = [('internal', c_byte * 128)]