
vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe

ActivationMethod

Bases: IntEnum

Source code in vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
class ActivationMethod(IntEnum):
    # This allows interfacing with AITER ActivationType enum
    # without importing the ActivationType enum from AITER globally.
    SILU = 0
    GELU = 1

GELU class-attribute instance-attribute

GELU = 1

SILU class-attribute instance-attribute

SILU = 0

QuantMethod

Bases: IntEnum

Source code in vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
class QuantMethod(IntEnum):
    # This allows interfacing with AITER QuantType Enum
    # without importing the QuantType from AITER globally.

    # Note that these quantization methods are
    # supported in AITER package. However,
    # not all are used in this module.

    NO = 0  # a16w16
    PER_TENSOR = 1  # w8a8 (per_Tensor)
    PER_TOKEN = 2  # w8a8/w8a4 (per_Token)
    BLOCK_1X32 = 3  # fp4x2
    BLOCK_1X128 = 4  # block quantized w8a8 (per_1x128)
    BLOCK_128x128 = 5  # block quantized w8a8 (per_128x128)

BLOCK_128x128 class-attribute instance-attribute

BLOCK_128x128 = 5

BLOCK_1X128 class-attribute instance-attribute

BLOCK_1X128 = 4

BLOCK_1X32 class-attribute instance-attribute

BLOCK_1X32 = 3

NO class-attribute instance-attribute

NO = 0

PER_TENSOR class-attribute instance-attribute

PER_TENSOR = 1

PER_TOKEN class-attribute instance-attribute

PER_TOKEN = 2
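
Both enums mirror AITER's own ActivationType and QuantType enums by integer value, so vLLM can pass plain ints across the custom-op boundary and defer importing AITER until a kernel is actually called. A minimal sketch of that conversion pattern, mirroring what the _impl functions below do (the helper name to_aiter_enums is illustrative, and the aiter import assumes a ROCm install with AITER available):

from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
    ActivationMethod,
    QuantMethod,
)

def to_aiter_enums(activation_method: int, quant_method: int):
    # Import AITER lazily so non-ROCm installs never touch the package.
    from aiter import ActivationType, QuantType

    # Integer values line up one-to-one, so plain enum construction works.
    return ActivationType(activation_method), QuantType(quant_method)

# Example: SILU activation with 128x128 block-quantized w8a8 weights.
activation, quant_type = to_aiter_enums(
    ActivationMethod.SILU.value, QuantMethod.BLOCK_128x128.value
)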

is_rocm_aiter_moe_enabled cached

is_rocm_aiter_moe_enabled() -> bool
Source code in vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
@cache
def is_rocm_aiter_moe_enabled() -> bool:
    return (
        current_platform.is_rocm()
        and envs.VLLM_ROCM_USE_AITER_MOE
        and envs.VLLM_ROCM_USE_AITER
    )
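
The result is cached, so the ROCm check and the VLLM_ROCM_USE_AITER / VLLM_ROCM_USE_AITER_MOE environment variables are evaluated only once per process. A hedged dispatch sketch; the fallback branch is a placeholder for whatever non-AITER path a caller would otherwise use:

from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
    is_rocm_aiter_moe_enabled,
    rocm_aiter_fused_experts,
)

def select_fused_experts():
    # Prefer the AITER kernels only on ROCm with both env flags enabled.
    if is_rocm_aiter_moe_enabled():
        return rocm_aiter_fused_experts
    # Fall back to whatever non-AITER path the caller uses (not shown here).
    raise NotImplementedError("non-AITER fallback not shown in this sketch")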

rocm_aiter_asm_moe_tkw1_fake

rocm_aiter_asm_moe_tkw1_fake(
    hidden_states: Tensor,
    w1: Tensor,
    w2: Tensor,
    topk_weights: Tensor,
    topk_ids: Tensor,
    fc1_scale: Optional[Tensor] = None,
    fc2_scale: Optional[Tensor] = None,
    fc1_smooth_scale: Optional[Tensor] = None,
    fc2_smooth_scale: Optional[Tensor] = None,
    a16: bool = False,
    per_tensor_quant_scale: Optional[Tensor] = None,
    expert_mask: Optional[Tensor] = None,
    activation_method: int = ActivationMethod.SILU.value,
) -> Tensor
Source code in vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
def rocm_aiter_asm_moe_tkw1_fake(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    fc1_scale: Optional[torch.Tensor] = None,
    fc2_scale: Optional[torch.Tensor] = None,
    fc1_smooth_scale: Optional[torch.Tensor] = None,
    fc2_smooth_scale: Optional[torch.Tensor] = None,
    a16: bool = False,
    per_tensor_quant_scale: Optional[torch.Tensor] = None,
    expert_mask: Optional[torch.Tensor] = None,
    activation_method: int = ActivationMethod.SILU.value,
) -> torch.Tensor:
    return torch.empty_like(hidden_states)

rocm_aiter_asm_moe_tkw1_impl

rocm_aiter_asm_moe_tkw1_impl(
    hidden_states: Tensor,
    w1: Tensor,
    w2: Tensor,
    topk_weights: Tensor,
    topk_ids: Tensor,
    fc1_scale: Optional[Tensor] = None,
    fc2_scale: Optional[Tensor] = None,
    fc1_smooth_scale: Optional[Tensor] = None,
    fc2_smooth_scale: Optional[Tensor] = None,
    a16: bool = False,
    per_tensor_quant_scale: Optional[Tensor] = None,
    expert_mask: Optional[Tensor] = None,
    activation_method: int = ActivationMethod.SILU.value,
) -> Tensor
Source code in vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
def rocm_aiter_asm_moe_tkw1_impl(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    fc1_scale: Optional[torch.Tensor] = None,
    fc2_scale: Optional[torch.Tensor] = None,
    fc1_smooth_scale: Optional[torch.Tensor] = None,
    fc2_smooth_scale: Optional[torch.Tensor] = None,
    a16: bool = False,
    per_tensor_quant_scale: Optional[torch.Tensor] = None,
    expert_mask: Optional[torch.Tensor] = None,
    activation_method: int = ActivationMethod.SILU.value,
) -> torch.Tensor:
    from aiter import ActivationType
    from aiter.fused_moe_bf16_asm import asm_moe_tkw1

    activation = ActivationType(activation_method)

    return asm_moe_tkw1(
        hidden_states,
        w1,
        w2,
        topk_weights,
        topk_ids,
        fc1_scale=fc1_scale,
        fc2_scale=fc2_scale,
        fc1_smooth_scale=fc1_smooth_scale,
        fc2_smooth_scale=fc2_smooth_scale,
        a16=a16,
        per_tensor_quant_scale=per_tensor_quant_scale,
        expert_mask=expert_mask,
        activation=activation,
    )
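
Each _impl function in this module has a matching _fake counterpart that only produces correctly shaped outputs, so torch.compile can trace through the registered custom op without running an AITER kernel. A minimal sketch of that impl/fake pairing using torch.library directly; vLLM registers these ops through its own helper, so the exact registration shown here is an assumption:

import torch

@torch.library.custom_op("mylib::toy_moe", mutates_args=())
def toy_moe(hidden_states: torch.Tensor) -> torch.Tensor:
    # Stand-in for the real AITER kernel call.
    return hidden_states * 2.0

@toy_moe.register_fake
def _(hidden_states: torch.Tensor) -> torch.Tensor:
    # Shape/dtype-only implementation used during tracing and compilation.
    return torch.empty_like(hidden_states)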

rocm_aiter_biased_grouped_topk_fake

rocm_aiter_biased_grouped_topk_fake(
    gating_output: Tensor,
    correction_bias: Tensor,
    topk_weights: Tensor,
    topk_ids: Tensor,
    num_expert_group: int,
    topk_group: int,
    need_renorm: bool,
    routed_scaling_factor: float = 1.0,
) -> None
Source code in vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
def rocm_aiter_biased_grouped_topk_fake(
    gating_output: torch.Tensor,
    correction_bias: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    num_expert_group: int,
    topk_group: int,
    need_renorm: bool,
    routed_scaling_factor: float = 1.0,  # mul to topk_weights
) -> None:
    pass

rocm_aiter_biased_grouped_topk_impl

rocm_aiter_biased_grouped_topk_impl(
    gating_output: Tensor,
    correction_bias: Tensor,
    topk_weights: Tensor,
    topk_ids: Tensor,
    num_expert_group: int,
    topk_group: int,
    need_renorm: bool,
    routed_scaling_factor: float = 1.0,
) -> None
Source code in vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
def rocm_aiter_biased_grouped_topk_impl(
    gating_output: torch.Tensor,
    correction_bias: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    num_expert_group: int,
    topk_group: int,
    need_renorm: bool,
    routed_scaling_factor: float = 1.0,  # mul to topk_weights
) -> None:
    from aiter import biased_grouped_topk

    biased_grouped_topk(
        gating_output,
        correction_bias,
        topk_weights,
        topk_ids,
        num_expert_group,
        topk_group,
        need_renorm,
        routed_scaling_factor,
    )

rocm_aiter_fused_experts

rocm_aiter_fused_experts(
    hidden_states: Tensor,
    w1: Tensor,
    w2: Tensor,
    topk_weights: Tensor,
    topk_ids: Tensor,
    activation: str = "silu",
    apply_router_weight_on_input: bool = False,
    expert_map: Optional[Tensor] = None,
    quant_config: Optional[FusedMoEQuantConfig] = None,
) -> Tensor
Source code in vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
def rocm_aiter_fused_experts(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    activation: str = "silu",
    apply_router_weight_on_input: bool = False,
    expert_map: Optional[torch.Tensor] = None,
    quant_config: Optional[FusedMoEQuantConfig] = None,
) -> torch.Tensor:
    if quant_config is None:
        quant_config = FUSED_MOE_UNQUANTIZED_CONFIG

    activation_method = (
        ActivationMethod.SILU if activation == "silu" else ActivationMethod.GELU
    )
    # All AITER Fused MoE kernels are expecting the following datatypes
    topk_weights = topk_weights.to(torch.float32)
    topk_ids = topk_ids.to(torch.int32)

    expert_mask = (expert_map > -1).to(torch.int32) if expert_map is not None else None

    # w8a8 per-channel quantization
    if (
        quant_config.per_act_token_quant
        and apply_router_weight_on_input
        and quant_config.use_fp8_w8a8
    ):
        # AITER tkw1 kernel for FP8 models with `apply_router_weight_on_input`
        # This applies topk_weights on the GEMM output of the first FC layer
        #  rather than the second FC.
        assert topk_weights.dim() == 2, (
            "`topk_weights` should be in shape (num_tokens, topk)"
        )
        assert topk_weights.shape[-1] == 1, (
            "Only support topk=1 when `apply_router_weight_on_input` is True"
        )

        return torch.ops.vllm.rocm_aiter_asm_moe_tkw1(
            hidden_states,
            w1,
            w2,
            topk_weights,
            topk_ids,
            fc1_scale=quant_config.w1_scale,
            fc2_scale=quant_config.w2_scale,
            fc1_smooth_scale=None,
            fc2_smooth_scale=None,
            a16=False,
            per_tensor_quant_scale=None,
            expert_mask=expert_mask,
            activation_method=activation_method,
        )

    else:
        quant_method = QuantMethod.NO.value

        # w8a8 block-scaled
        if quant_config.block_shape is not None and quant_config.use_fp8_w8a8:
            assert not apply_router_weight_on_input, (
                "apply_router_weight_on_input is\
                not supported for block scaled moe"
            )
            assert quant_config.w1_scale is not None
            assert quant_config.w2_scale is not None
            quant_method = QuantMethod.BLOCK_128x128.value
        elif quant_config.use_fp8_w8a8:
            # Currently only per tensor quantization method is enabled.
            quant_method = QuantMethod.PER_TENSOR.value

        if apply_router_weight_on_input:
            assert topk_weights.dim() == 2, (
                "`topk_weights` should be in shape (num_tokens, topk)"
            )
            _, topk = topk_weights.shape
            assert topk == 1, (
                "Only support topk=1 when `apply_router_weight_on_input` is True"
            )

        return torch.ops.vllm.rocm_aiter_fused_moe(
            hidden_states,
            w1,
            w2,
            topk_weights,
            topk_ids,
            expert_mask=expert_mask,
            quant_method=quant_method,
            activation_method=activation_method,
            w1_scale=quant_config.w1_scale,
            w2_scale=quant_config.w2_scale,
            a1_scale=quant_config.a1_scale,
            a2_scale=quant_config.a2_scale,
            doweight_stage1=apply_router_weight_on_input,
        )
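
A usage sketch for the unquantized path (quant_config=None selects FUSED_MOE_UNQUANTIZED_CONFIG, i.e. QuantMethod.NO). The shapes follow the usual fused-MoE convention in which w1 holds the fused gate/up projection and w2 the down projection; the concrete sizes are illustrative and assume a ROCm device with AITER installed:

import torch
from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
    rocm_aiter_fused_experts,
)

num_tokens, hidden, inter, num_experts, topk = 8, 4096, 14336, 8, 2

hidden_states = torch.randn(num_tokens, hidden, dtype=torch.bfloat16, device="cuda")
# w1 fuses the gate and up projections, hence the 2 * inter leading dimension.
w1 = torch.randn(num_experts, 2 * inter, hidden, dtype=torch.bfloat16, device="cuda")
w2 = torch.randn(num_experts, hidden, inter, dtype=torch.bfloat16, device="cuda")

topk_weights = torch.rand(num_tokens, topk, dtype=torch.float32, device="cuda")
topk_ids = torch.randint(
    0, num_experts, (num_tokens, topk), dtype=torch.int32, device="cuda"
)

out = rocm_aiter_fused_experts(
    hidden_states, w1, w2, topk_weights, topk_ids, activation="silu"
)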

rocm_aiter_fused_moe_fake

rocm_aiter_fused_moe_fake(
    hidden_states: Tensor,
    w1: Tensor,
    w2: Tensor,
    topk_weight: Tensor,
    topk_ids: Tensor,
    expert_mask: Optional[Tensor] = None,
    activation_method: int = ActivationMethod.SILU.value,
    quant_method: int = QuantMethod.NO.value,
    doweight_stage1: bool = False,
    w1_scale: Optional[Tensor] = None,
    w2_scale: Optional[Tensor] = None,
    a1_scale: Optional[Tensor] = None,
    a2_scale: Optional[Tensor] = None,
) -> Tensor
Source code in vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
def rocm_aiter_fused_moe_fake(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weight: torch.Tensor,
    topk_ids: torch.Tensor,
    expert_mask: Optional[torch.Tensor] = None,
    activation_method: int = ActivationMethod.SILU.value,
    quant_method: int = QuantMethod.NO.value,
    doweight_stage1: bool = False,
    w1_scale: Optional[torch.Tensor] = None,
    w2_scale: Optional[torch.Tensor] = None,
    a1_scale: Optional[torch.Tensor] = None,
    a2_scale: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    return torch.empty_like(hidden_states)

rocm_aiter_fused_moe_impl

rocm_aiter_fused_moe_impl(
    hidden_states: Tensor,
    w1: Tensor,
    w2: Tensor,
    topk_weight: Tensor,
    topk_ids: Tensor,
    expert_mask: Optional[Tensor] = None,
    activation_method: int = ActivationMethod.SILU.value,
    quant_method: int = QuantMethod.NO.value,
    doweight_stage1: bool = False,
    w1_scale: Optional[Tensor] = None,
    w2_scale: Optional[Tensor] = None,
    a1_scale: Optional[Tensor] = None,
    a2_scale: Optional[Tensor] = None,
) -> Tensor
Source code in vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
def rocm_aiter_fused_moe_impl(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weight: torch.Tensor,
    topk_ids: torch.Tensor,
    expert_mask: Optional[torch.Tensor] = None,
    activation_method: int = ActivationMethod.SILU.value,
    quant_method: int = QuantMethod.NO.value,
    doweight_stage1: bool = False,
    w1_scale: Optional[torch.Tensor] = None,
    w2_scale: Optional[torch.Tensor] = None,
    a1_scale: Optional[torch.Tensor] = None,
    a2_scale: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    from aiter import ActivationType, QuantType
    from aiter.fused_moe import fused_moe

    activation = ActivationType(activation_method)
    quant_type = QuantType(quant_method)

    return fused_moe(
        hidden_states,
        w1,
        w2,
        topk_weight,
        topk_ids,
        expert_mask,
        activation,
        quant_type,
        doweight_stage1,
        w1_scale,
        w2_scale,
        a1_scale,
        a2_scale,
    )

rocm_aiter_grouped_topk

rocm_aiter_grouped_topk(
    hidden_states: Tensor,
    gating_output: Tensor,
    topk: int,
    renormalize: bool,
    num_expert_group: int = 0,
    topk_group: int = 0,
    scoring_func: str = "softmax",
    routed_scaling_factor: float = 1.0,
    e_score_correction_bias: Optional[Tensor] = None,
) -> tuple[Tensor, Tensor]
Source code in vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
def rocm_aiter_grouped_topk(
    hidden_states: torch.Tensor,
    gating_output: torch.Tensor,
    topk: int,
    renormalize: bool,
    num_expert_group: int = 0,
    topk_group: int = 0,
    scoring_func: str = "softmax",
    routed_scaling_factor: float = 1.0,
    e_score_correction_bias: Optional[torch.Tensor] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    token = hidden_states.shape[0]
    device = hidden_states.device
    topk_ids = torch.empty((token, topk), dtype=torch.int32, device=device)
    topk_weights = torch.empty((token, topk), dtype=torch.float32, device=device)

    if e_score_correction_bias is not None:
        torch.ops.vllm.rocm_aiter_biased_grouped_topk(
            gating_output,
            e_score_correction_bias.to(gating_output.dtype),
            topk_weights,
            topk_ids,
            num_expert_group,
            topk_group,
            renormalize,
        )
    else:
        assert scoring_func == "softmax" or scoring_func == "sigmoid"
        torch.ops.vllm.rocm_aiter_grouped_topk(
            gating_output,
            topk_weights,
            topk_ids,
            num_expert_group,
            topk_group,
            renormalize,
            scoring_func,
        )

    if routed_scaling_factor != 1.0:
        topk_weights = topk_weights * routed_scaling_factor
    return topk_weights, topk_ids
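
A usage sketch for grouped routing; the expert, group, and top-k counts are illustrative, and gating_output is assumed to be the raw router logits for each token:

import torch
from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
    rocm_aiter_grouped_topk,
)

num_tokens, hidden, num_experts = 16, 4096, 64

hidden_states = torch.randn(num_tokens, hidden, dtype=torch.bfloat16, device="cuda")
gating_output = torch.randn(num_tokens, num_experts, dtype=torch.float32, device="cuda")

topk_weights, topk_ids = rocm_aiter_grouped_topk(
    hidden_states,
    gating_output,
    topk=6,
    renormalize=True,
    num_expert_group=8,
    topk_group=3,
    scoring_func="sigmoid",
    routed_scaling_factor=2.5,
)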

rocm_aiter_grouped_topk_fake

rocm_aiter_grouped_topk_fake(
    gating_output: Tensor,
    topk_weights: Tensor,
    topk_ids: Tensor,
    num_expert_group: int,
    topk_group: int,
    need_renorm: bool,
    scoring_func: str = "softmax",
    routed_scaling_factor: float = 1.0,
) -> None
Source code in vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
def rocm_aiter_grouped_topk_fake(
    gating_output: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    num_expert_group: int,
    topk_group: int,
    need_renorm: bool,
    scoring_func: str = "softmax",
    routed_scaling_factor: float = 1.0,  # mul to topk_weights
) -> None:
    pass

rocm_aiter_grouped_topk_impl

rocm_aiter_grouped_topk_impl(
    gating_output: Tensor,
    topk_weights: Tensor,
    topk_ids: Tensor,
    num_expert_group: int,
    topk_group: int,
    need_renorm: bool,
    scoring_func: str = "softmax",
    routed_scaling_factor: float = 1.0,
) -> None
Source code in vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
def rocm_aiter_grouped_topk_impl(
    gating_output: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    num_expert_group: int,
    topk_group: int,
    need_renorm: bool,
    scoring_func: str = "softmax",
    routed_scaling_factor: float = 1.0,  # mul to topk_weights
) -> None:
    from aiter import grouped_topk

    grouped_topk(
        gating_output,
        topk_weights,
        topk_ids,
        num_expert_group,
        topk_group,
        need_renorm,
        scoring_func,
        routed_scaling_factor,
    )

rocm_aiter_topk_softmax

rocm_aiter_topk_softmax(
    topk_weights: Tensor,
    topk_indices: Tensor,
    token_expert_indices: Tensor,
    gating_output: Tensor,
    renormalize: bool,
) -> tuple[Tensor, ...]
Source code in vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
def rocm_aiter_topk_softmax(
    topk_weights: torch.Tensor,
    topk_indices: torch.Tensor,
    token_expert_indices: torch.Tensor,
    gating_output: torch.Tensor,
    renormalize: bool,
) -> tuple[torch.Tensor, ...]:
    torch.ops.vllm.rocm_aiter_topk_softmax(
        topk_weights, topk_indices, token_expert_indices, gating_output, renormalize
    )
    return topk_weights, topk_indices
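
The wrapper dispatches to the registered custom op, which writes into caller-allocated buffers, then returns the weight/index pair. A sketch; the buffer dtypes are an assumption based on the conventions used elsewhere in this module:

import torch
from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
    rocm_aiter_topk_softmax,
)

num_tokens, num_experts, topk = 16, 8, 2

gating_output = torch.randn(num_tokens, num_experts, dtype=torch.float32, device="cuda")

# Results are written in place into these preallocated buffers.
topk_weights = torch.empty(num_tokens, topk, dtype=torch.float32, device="cuda")
topk_indices = torch.empty(num_tokens, topk, dtype=torch.int32, device="cuda")
token_expert_indices = torch.empty(num_tokens, topk, dtype=torch.int32, device="cuda")

topk_weights, topk_indices = rocm_aiter_topk_softmax(
    topk_weights, topk_indices, token_expert_indices, gating_output, renormalize=True
)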

rocm_aiter_topk_softmax_fake

rocm_aiter_topk_softmax_fake(
    topk_weights: Tensor,
    topk_indices: Tensor,
    token_expert_indices: Tensor,
    gating_output: Tensor,
    renormalize: bool,
) -> None
Source code in vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
def rocm_aiter_topk_softmax_fake(
    topk_weights: torch.Tensor,
    topk_indices: torch.Tensor,
    token_expert_indices: torch.Tensor,
    gating_output: torch.Tensor,
    renormalize: bool,
) -> None:
    pass

rocm_aiter_topk_softmax_impl

rocm_aiter_topk_softmax_impl(
    topk_weights: Tensor,
    topk_indices: Tensor,
    token_expert_indices: Tensor,
    gating_output: Tensor,
    renormalize: bool,
) -> None
Source code in vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
def rocm_aiter_topk_softmax_impl(
    topk_weights: torch.Tensor,
    topk_indices: torch.Tensor,
    token_expert_indices: torch.Tensor,
    gating_output: torch.Tensor,
    renormalize: bool,
) -> None:
    from aiter import topk_softmax

    topk_softmax(
        topk_weights, topk_indices, token_expert_indices, gating_output, renormalize
    )

shuffle_weights

shuffle_weights(
    *tensors: Tensor, layout: tuple[int, int] = (16, 16)
) -> tuple[Tensor, ...]

Applies the shuffle_weight function from AITER to each input tensor and returns the results.

Rearranges (shuffles) the input tensor(s) into a specified block layout for optimized computation.

Parameters:

Name      Type              Description                                                     Default
*tensors  Tensor            Variable number of torch.Tensor objects.                        ()
layout    tuple[int, int]   A pair of integers specifying the block sizes used to divide    (16, 16)
                            the tensors during shuffling. Default is (16, 16).

Returns: A Tuple of shuffled tensors.

Source code in vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
def shuffle_weights(
    *tensors: torch.Tensor, layout: tuple[int, int] = (16, 16)
) -> tuple[torch.Tensor, ...]:
    """
    Applies shuffle_weight function from AITER to each
    input tensor and returns them.

    Rearranges (shuffles) the input tensor/s
    into a specified block layout for optimized computation.

    Args:
        *tensors: Variable number of torch.Tensor objects.
        layout: A pair of integers specifying the block sizes used to divide
            the tensors during shuffling. Default is (16, 16).

    Returns:
    A Tuple of shuffled tensors.
    """
    from aiter.ops.shuffle import shuffle_weight

    return tuple(shuffle_weight(tensor, layout=layout) for tensor in tensors)
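
Typically this is applied once at weight-loading time to both expert projection tensors; a short sketch with illustrative shapes:

import torch
from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import shuffle_weights

num_experts, hidden, inter = 8, 4096, 14336

w13 = torch.randn(num_experts, 2 * inter, hidden, dtype=torch.bfloat16, device="cuda")
w2 = torch.randn(num_experts, hidden, inter, dtype=torch.bfloat16, device="cuda")

# Shuffle both weights into the (16, 16) block layout expected by the AITER kernels.
w13_shuffled, w2_shuffled = shuffle_weights(w13, w2, layout=(16, 16))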