vllm.model_executor.models.kimi_vl

KimiVLImageInputs module-attribute

KimiVLImageInputs = KimiVLImagePixelInputs

KimiVLDummyInputsBuilder

Bases: BaseDummyInputsBuilder[KimiVLProcessingInfo]

Source code in vllm/model_executor/models/kimi_vl.py
class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]):
    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        num_images = mm_counts.get("image", 0)

        processor = self.info.get_hf_processor()
        image_token = processor.image_token

        return image_token * num_images

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
    ) -> MultiModalDataDict:
        num_images = mm_counts.get("image", 0)

        image_overrides = mm_options.get("image") if mm_options else None

        return {
            "image": self._get_dummy_images(
                width=MaxImageTokenMeta.width,
                height=MaxImageTokenMeta.height,
                num_images=num_images,
                overrides=image_overrides,
            )
        }

get_dummy_mm_data

get_dummy_mm_data(
    seq_len: int,
    mm_counts: Mapping[str, int],
    mm_options: Optional[
        Mapping[str, BaseDummyOptions]
    ] = None,
) -> MultiModalDataDict
Source code in vllm/model_executor/models/kimi_vl.py
def get_dummy_mm_data(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
    mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict:
    num_images = mm_counts.get("image", 0)

    image_overrides = mm_options.get("image") if mm_options else None

    return {
        "image": self._get_dummy_images(
            width=MaxImageTokenMeta.width,
            height=MaxImageTokenMeta.height,
            num_images=num_images,
            overrides=image_overrides,
        )
    }

get_dummy_text

get_dummy_text(mm_counts: Mapping[str, int]) -> str
Source code in vllm/model_executor/models/kimi_vl.py
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
    num_images = mm_counts.get("image", 0)

    processor = self.info.get_hf_processor()
    image_token = processor.image_token

    return image_token * num_images
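
For illustration, a minimal sketch of what these dummy inputs amount to during memory profiling. The image token string is an assumption (the real one comes from the HF processor), and the 1024x1024 size comes from MaxImageTokenMeta below:

from PIL import Image

num_images = 2
image_token = "<|media_pad|>"  # assumption: the actual token string comes from the HF processor
dummy_text = image_token * num_images                               # "<|media_pad|><|media_pad|>"
dummy_images = [Image.new("RGB", (1024, 1024)) for _ in range(num_images)]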

KimiVLForConditionalGeneration

Bases: Module, SupportsMultiModal, SupportsPP

Source code in vllm/model_executor/models/kimi_vl.py
@MULTIMODAL_REGISTRY.register_processor(
    KimiVLMultiModalProcessor,
    info=KimiVLProcessingInfo,
    dummy_inputs=KimiVLDummyInputsBuilder,
)
class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
    merge_by_field_config = True

    supports_encoder_tp_data = True

    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
        if modality.startswith("image"):
            return "<|media_start|>image<|media_content|><|media_pad|><|media_end|>"

        raise ValueError("Only image modality is supported")

    def __init__(
        self,
        vllm_config: VllmConfig,
        prefix: str = "",
    ) -> None:
        super().__init__()
        model_config = vllm_config.model_config
        config: KimiVLConfig = model_config.hf_config
        self.config = config
        quant_config = vllm_config.quant_config

        assert isinstance(config.vision_config, MoonViTConfig)
        self.use_data_parallel = (
            model_config.multimodal_config.mm_encoder_tp_mode == "data"
        )
        self.hidden_size = config.text_config.hidden_size
        self.vision_tower = MoonVitPretrainedModel(
            config.vision_config,
            self.use_data_parallel,
            prefix=maybe_prefix(prefix, "vision_tower"),
        )

        self.multi_modal_projector = KimiVLMultiModalProjector(
            config=config,
            use_data_parallel=self.use_data_parallel,
            prefix=maybe_prefix(prefix, "multi_modal_projector"),
        )

        self.quant_config = quant_config
        sub_vllm_config = copy.deepcopy(vllm_config)
        sub_vllm_config.model_config.hf_config = (
            sub_vllm_config.model_config.hf_config.text_config
        )
        self.language_model = DeepseekV2Model(
            vllm_config=sub_vllm_config,
            prefix=maybe_prefix(prefix, "language_model"),
        )
        self.unpadded_vocab_size = config.text_config.vocab_size
        if get_pp_group().is_last_rank:
            self.lm_head = ParallelLMHead(
                self.unpadded_vocab_size,
                config.text_config.hidden_size,
                org_num_embeddings=self.config.text_config.vocab_size,
                padding_size=DEFAULT_VOCAB_PADDING_SIZE,
                prefix=maybe_prefix(prefix, "lm_head"),
            )
        else:
            self.lm_head = PPMissingLayer()
        self.make_empty_intermediate_tensors = (
            self.language_model.make_empty_intermediate_tensors
        )
        logit_scale = getattr(config, "logit_scale", 1.0)
        self.logits_processor = LogitsProcessor(
            self.unpadded_vocab_size, config.vocab_size, logit_scale
        )
        self.media_placeholder: int = self.config.media_placeholder_token_id

    def _parse_and_validate_image_input(
        self, **kwargs: object
    ) -> Optional[KimiVLImageInputs]:
        # image input type must be pixel values now
        pixel_values = kwargs.pop("pixel_values", None)
        image_grid_hws = kwargs.pop("image_grid_hws", None)

        if pixel_values is None:
            return None

        return KimiVLImagePixelInputs(
            type="pixel_values",
            pixel_values=pixel_values,
            image_grid_hws=image_grid_hws,
        )

    # run the vision tower on the processed pixel_values
    @torch.inference_mode()
    def _process_image_pixels(self, inputs: KimiVLImagePixelInputs) -> torch.Tensor:
        assert self.vision_tower is not None

        pixel_values = inputs["pixel_values"]
        image_grid_hws = inputs["image_grid_hws"]
        if self.use_data_parallel:
            return run_dp_sharded_mrope_vision_model(
                self.vision_tower,
                pixel_values,
                image_grid_hws.tolist(),
                rope_type="rope_2d",
            )
        else:
            return self.vision_tower(pixel_values, image_grid_hws)

    def _process_image_input(self, image_input: KimiVLImageInputs) -> torch.Tensor:
        assert image_input["type"] == "pixel_values"
        image_features = self._process_image_pixels(image_input)
        assert isinstance(image_features, (list, tuple))
        lengths = [x.shape[0] for x in image_features]
        return self.multi_modal_projector(torch.cat(image_features)).split(lengths)

    def get_language_model(self) -> torch.nn.Module:
        return self.language_model

    def get_multimodal_embeddings(self, **kwargs: object) -> Optional[NestedTensors]:
        # Validate the multimodal input keyword arguments
        image_input = self._parse_and_validate_image_input(**kwargs)
        if image_input is None:
            return None

        # Run multimodal inputs through encoder and projector
        vision_embeddings = self._process_image_input(image_input)
        return vision_embeddings

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs: object,
    ) -> IntermediateTensors:
        if intermediate_tensors is not None:
            inputs_embeds = None

        hidden_states = self.language_model(
            input_ids=input_ids,
            positions=positions,
            intermediate_tensors=intermediate_tensors,
            inputs_embeds=inputs_embeds,
        )

        return hidden_states

    def compute_logits(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor:
        logits = self.logits_processor(self.lm_head, hidden_states, **kwargs)
        return logits

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
        config = self.config.text_config
        _KEYS_TO_MODIFY_MAPPING = {
            "language_model.lm_head": "lm_head",
            "language_model.model": "language_model",
        }
        # only doing this for language model part for now.
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            (".gate_up_proj", ".gate_proj", 0),
            (".gate_up_proj", ".up_proj", 1),
        ]
        if not config.use_mla:
            stacked_params_mapping += [
                (".qkv_proj", ".q_proj", "q"),
                (".qkv_proj", ".k_proj", "k"),
                (".qkv_proj", ".v_proj", "v"),
            ]
        if getattr(config, "n_routed_experts", None):
            # Params for weights, fp8 weight scales, fp8 activation scales
            # (param_name, weight_name, expert_id, shard_id)
            expert_params_mapping = FusedMoE.make_expert_params_mapping(
                ckpt_gate_proj_name="gate_proj",
                ckpt_down_proj_name="down_proj",
                ckpt_up_proj_name="up_proj",
                num_experts=config.n_routed_experts,
            )
        else:
            expert_params_mapping = []

        params_dict = dict(self.named_parameters())

        for args in weights:
            name, loaded_weight = args[:2]
            kwargs = args[2] if len(args) > 2 else {}
            if "rotary_emb.inv_freq" in name:
                continue

            spec_layer = get_spec_layer_idx_from_weight_name(config, name)
            if spec_layer is not None:
                continue  # skip spec decode layers for main model

            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
                # Models trained using ColossalAI may include these tensors in
                # the checkpoint. Skip them.
                continue
            for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items():
                if key_to_modify in name:
                    name = name.replace(key_to_modify, new_key)
            use_default_weight_loading = False
            if "vision" in name:
                if self.vision_tower is not None:
                    # We only do sharding for language model and
                    # not vision model for now.
                    use_default_weight_loading = True
            else:
                for param_name, weight_name, shard_id in stacked_params_mapping:
                    if weight_name not in name:
                        continue
                    # We have mlp.experts[0].gate_proj in the checkpoint.
                    # Since we handle the experts below in expert_params_mapping,
                    # we need to skip here BEFORE we update the name, otherwise
                    # name will be updated to mlp.experts[0].gate_up_proj, which
                    # will then be updated below in expert_params_mapping
                    # for mlp.experts[0].gate_gate_up_proj, which breaks load.
                    if ("mlp.experts." in name) and name not in params_dict:
                        continue
                    name = name.replace(weight_name, param_name)
                    # Skip loading extra bias for GPTQ models.
                    if name.endswith(".bias") and name not in params_dict:
                        continue

                    if is_pp_missing_parameter(name, self):
                        continue

                    param = params_dict[name]
                    weight_loader = param.weight_loader
                    weight_loader(param, loaded_weight, shard_id, **kwargs)
                    break
                else:
                    for idx, (
                        param_name,
                        weight_name,
                        expert_id,
                        shard_id,
                    ) in enumerate(expert_params_mapping):
                        if weight_name not in name:
                            continue
                        name = name.replace(weight_name, param_name)

                        if is_pp_missing_parameter(name, self):
                            continue

                        param = params_dict[name]
                        weight_loader = param.weight_loader
                        weight_loader(
                            param,
                            loaded_weight,
                            name,
                            expert_id=expert_id,
                            shard_id=shard_id,
                            **kwargs,
                        )
                        break
                    else:
                        use_default_weight_loading = True
            if use_default_weight_loading:
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                # Remapping the name of FP8 kv-scale.
                name = maybe_remap_kv_scale_name(name, params_dict)
                if name is None:
                    continue

                if is_pp_missing_parameter(name, self):
                    continue

                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                weight_loader(param, loaded_weight, **kwargs)
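
As a usage sketch (not part of this module), the model can be served through vLLM's public LLM API. The checkpoint name and prompt handling below are assumptions and may differ from your setup:

from PIL import Image
from vllm import LLM, SamplingParams

# Hypothetical serving example; "moonshotai/Kimi-VL-A3B-Instruct" is an assumed checkpoint name.
llm = LLM(model="moonshotai/Kimi-VL-A3B-Instruct", trust_remote_code=True)
prompt = (
    "<|media_start|>image<|media_content|><|media_pad|><|media_end|>"
    "Describe the image."
)
outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": Image.open("example.jpg")}},
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)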

config instance-attribute

config = config

hidden_size instance-attribute

hidden_size = hidden_size

language_model instance-attribute

language_model = DeepseekV2Model(
    vllm_config=sub_vllm_config,
    prefix=maybe_prefix(prefix, "language_model"),
)

lm_head instance-attribute

lm_head = ParallelLMHead(
    unpadded_vocab_size,
    hidden_size,
    org_num_embeddings=vocab_size,
    padding_size=DEFAULT_VOCAB_PADDING_SIZE,
    prefix=maybe_prefix(prefix, "lm_head"),
)

logits_processor instance-attribute

logits_processor = LogitsProcessor(
    unpadded_vocab_size, vocab_size, logit_scale
)

make_empty_intermediate_tensors instance-attribute

make_empty_intermediate_tensors = (
    make_empty_intermediate_tensors
)

media_placeholder instance-attribute

media_placeholder: int = media_placeholder_token_id

merge_by_field_config class-attribute instance-attribute

merge_by_field_config = True

multi_modal_projector instance-attribute

multi_modal_projector = KimiVLMultiModalProjector(
    config=config,
    use_data_parallel=use_data_parallel,
    prefix=maybe_prefix(prefix, "multi_modal_projector"),
)

quant_config instance-attribute

quant_config = quant_config

supports_encoder_tp_data class-attribute instance-attribute

supports_encoder_tp_data = True

unpadded_vocab_size instance-attribute

unpadded_vocab_size = vocab_size

use_data_parallel instance-attribute

use_data_parallel = mm_encoder_tp_mode == 'data'

vision_tower instance-attribute

vision_tower = MoonVitPretrainedModel(
    vision_config,
    use_data_parallel,
    prefix=maybe_prefix(prefix, "vision_tower"),
)

__init__

__init__(vllm_config: VllmConfig, prefix: str = '') -> None
Source code in vllm/model_executor/models/kimi_vl.py
def __init__(
    self,
    vllm_config: VllmConfig,
    prefix: str = "",
) -> None:
    super().__init__()
    model_config = vllm_config.model_config
    config: KimiVLConfig = model_config.hf_config
    self.config = config
    quant_config = vllm_config.quant_config

    assert isinstance(config.vision_config, MoonViTConfig)
    self.use_data_parallel = (
        model_config.multimodal_config.mm_encoder_tp_mode == "data"
    )
    self.hidden_size = config.text_config.hidden_size
    self.vision_tower = MoonVitPretrainedModel(
        config.vision_config,
        self.use_data_parallel,
        prefix=maybe_prefix(prefix, "vision_tower"),
    )

    self.multi_modal_projector = KimiVLMultiModalProjector(
        config=config,
        use_data_parallel=self.use_data_parallel,
        prefix=maybe_prefix(prefix, "multi_modal_projector"),
    )

    self.quant_config = quant_config
    sub_vllm_config = copy.deepcopy(vllm_config)
    sub_vllm_config.model_config.hf_config = (
        sub_vllm_config.model_config.hf_config.text_config
    )
    self.language_model = DeepseekV2Model(
        vllm_config=sub_vllm_config,
        prefix=maybe_prefix(prefix, "language_model"),
    )
    self.unpadded_vocab_size = config.text_config.vocab_size
    if get_pp_group().is_last_rank:
        self.lm_head = ParallelLMHead(
            self.unpadded_vocab_size,
            config.text_config.hidden_size,
            org_num_embeddings=self.config.text_config.vocab_size,
            padding_size=DEFAULT_VOCAB_PADDING_SIZE,
            prefix=maybe_prefix(prefix, "lm_head"),
        )
    else:
        self.lm_head = PPMissingLayer()
    self.make_empty_intermediate_tensors = (
        self.language_model.make_empty_intermediate_tensors
    )
    logit_scale = getattr(config, "logit_scale", 1.0)
    self.logits_processor = LogitsProcessor(
        self.unpadded_vocab_size, config.vocab_size, logit_scale
    )
    self.media_placeholder: int = self.config.media_placeholder_token_id

_parse_and_validate_image_input

_parse_and_validate_image_input(
    **kwargs: object,
) -> Optional[KimiVLImageInputs]
Source code in vllm/model_executor/models/kimi_vl.py
def _parse_and_validate_image_input(
    self, **kwargs: object
) -> Optional[KimiVLImageInputs]:
    # image input type must be pixel values now
    pixel_values = kwargs.pop("pixel_values", None)
    image_grid_hws = kwargs.pop("image_grid_hws", None)

    if pixel_values is None:
        return None

    return KimiVLImagePixelInputs(
        type="pixel_values",
        pixel_values=pixel_values,
        image_grid_hws=image_grid_hws,
    )

_process_image_input

_process_image_input(
    image_input: KimiVLImageInputs,
) -> Tensor
Source code in vllm/model_executor/models/kimi_vl.py
def _process_image_input(self, image_input: KimiVLImageInputs) -> torch.Tensor:
    assert image_input["type"] == "pixel_values"
    image_features = self._process_image_pixels(image_input)
    assert isinstance(image_features, (list, tuple))
    lengths = [x.shape[0] for x in image_features]
    return self.multi_modal_projector(torch.cat(image_features)).split(lengths)
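
The concatenate, project, then split-by-lengths pattern above can be shown in plain PyTorch (the sizes are made up):

import torch

# Per-image feature chunks of different lengths are concatenated, projected once,
# then split back into per-image tensors using the recorded lengths.
image_features = [torch.randn(4, 8), torch.randn(6, 8)]
lengths = [x.shape[0] for x in image_features]        # [4, 6]
projector = torch.nn.Linear(8, 16)                    # stand-in for the multimodal projector
outputs = projector(torch.cat(image_features)).split(lengths)
# outputs[0].shape == (4, 16), outputs[1].shape == (6, 16)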

_process_image_pixels

_process_image_pixels(
    inputs: KimiVLImagePixelInputs,
) -> Tensor
Source code in vllm/model_executor/models/kimi_vl.py
@torch.inference_mode()
def _process_image_pixels(self, inputs: KimiVLImagePixelInputs) -> torch.Tensor:
    assert self.vision_tower is not None

    pixel_values = inputs["pixel_values"]
    image_grid_hws = inputs["image_grid_hws"]
    if self.use_data_parallel:
        return run_dp_sharded_mrope_vision_model(
            self.vision_tower,
            pixel_values,
            image_grid_hws.tolist(),
            rope_type="rope_2d",
        )
    else:
        return self.vision_tower(pixel_values, image_grid_hws)

compute_logits

compute_logits(hidden_states: Tensor, **kwargs) -> Tensor
Source code in vllm/model_executor/models/kimi_vl.py
def compute_logits(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor:
    logits = self.logits_processor(self.lm_head, hidden_states, **kwargs)
    return logits

forward

forward(
    input_ids: Tensor,
    positions: Tensor,
    intermediate_tensors: Optional[
        IntermediateTensors
    ] = None,
    inputs_embeds: Optional[Tensor] = None,
    **kwargs: object,
) -> IntermediateTensors
Source code in vllm/model_executor/models/kimi_vl.py
def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    intermediate_tensors: Optional[IntermediateTensors] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
    **kwargs: object,
) -> IntermediateTensors:
    if intermediate_tensors is not None:
        inputs_embeds = None

    hidden_states = self.language_model(
        input_ids=input_ids,
        positions=positions,
        intermediate_tensors=intermediate_tensors,
        inputs_embeds=inputs_embeds,
    )

    return hidden_states

get_language_model

get_language_model() -> Module
Source code in vllm/model_executor/models/kimi_vl.py
def get_language_model(self) -> torch.nn.Module:
    return self.language_model

get_multimodal_embeddings

get_multimodal_embeddings(
    **kwargs: object,
) -> Optional[NestedTensors]
Source code in vllm/model_executor/models/kimi_vl.py
def get_multimodal_embeddings(self, **kwargs: object) -> Optional[NestedTensors]:
    # Validate the multimodal input keyword arguments
    image_input = self._parse_and_validate_image_input(**kwargs)
    if image_input is None:
        return None

    # Run multimodal inputs through encoder and projector
    vision_embeddings = self._process_image_input(image_input)
    return vision_embeddings

get_placeholder_str classmethod

get_placeholder_str(modality: str, i: int) -> Optional[str]
Source code in vllm/model_executor/models/kimi_vl.py
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
    if modality.startswith("image"):
        return "<|media_start|>image<|media_content|><|media_pad|><|media_end|>"

    raise ValueError("Only image modality is supported")

load_weights

load_weights(weights: Iterable[tuple[str, Tensor]])
Source code in vllm/model_executor/models/kimi_vl.py
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
    config = self.config.text_config
    _KEYS_TO_MODIFY_MAPPING = {
        "language_model.lm_head": "lm_head",
        "language_model.model": "language_model",
    }
    # only doing this for language model part for now.
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        (".gate_up_proj", ".gate_proj", 0),
        (".gate_up_proj", ".up_proj", 1),
    ]
    if not config.use_mla:
        stacked_params_mapping += [
            (".qkv_proj", ".q_proj", "q"),
            (".qkv_proj", ".k_proj", "k"),
            (".qkv_proj", ".v_proj", "v"),
        ]
    if getattr(config, "n_routed_experts", None):
        # Params for weights, fp8 weight scales, fp8 activation scales
        # (param_name, weight_name, expert_id, shard_id)
        expert_params_mapping = FusedMoE.make_expert_params_mapping(
            ckpt_gate_proj_name="gate_proj",
            ckpt_down_proj_name="down_proj",
            ckpt_up_proj_name="up_proj",
            num_experts=config.n_routed_experts,
        )
    else:
        expert_params_mapping = []

    params_dict = dict(self.named_parameters())

    for args in weights:
        name, loaded_weight = args[:2]
        kwargs = args[2] if len(args) > 2 else {}
        if "rotary_emb.inv_freq" in name:
            continue

        spec_layer = get_spec_layer_idx_from_weight_name(config, name)
        if spec_layer is not None:
            continue  # skip spec decode layers for main model

        if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
            # Models trained using ColossalAI may include these tensors in
            # the checkpoint. Skip them.
            continue
        for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items():
            if key_to_modify in name:
                name = name.replace(key_to_modify, new_key)
        use_default_weight_loading = False
        if "vision" in name:
            if self.vision_tower is not None:
                # We only do sharding for language model and
                # not vision model for now.
                use_default_weight_loading = True
        else:
            for param_name, weight_name, shard_id in stacked_params_mapping:
                if weight_name not in name:
                    continue
                # We have mlp.experts[0].gate_proj in the checkpoint.
                # Since we handle the experts below in expert_params_mapping,
                # we need to skip here BEFORE we update the name, otherwise
                # name will be updated to mlp.experts[0].gate_up_proj, which
                # will then be updated below in expert_params_mapping
                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
                if ("mlp.experts." in name) and name not in params_dict:
                    continue
                name = name.replace(weight_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue

                if is_pp_missing_parameter(name, self):
                    continue

                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id, **kwargs)
                break
            else:
                for idx, (
                    param_name,
                    weight_name,
                    expert_id,
                    shard_id,
                ) in enumerate(expert_params_mapping):
                    if weight_name not in name:
                        continue
                    name = name.replace(weight_name, param_name)

                    if is_pp_missing_parameter(name, self):
                        continue

                    param = params_dict[name]
                    weight_loader = param.weight_loader
                    weight_loader(
                        param,
                        loaded_weight,
                        name,
                        expert_id=expert_id,
                        shard_id=shard_id,
                        **kwargs,
                    )
                    break
                else:
                    use_default_weight_loading = True
        if use_default_weight_loading:
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            # Remapping the name of FP8 kv-scale.
            name = maybe_remap_kv_scale_name(name, params_dict)
            if name is None:
                continue

            if is_pp_missing_parameter(name, self):
                continue

            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, loaded_weight, **kwargs)
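
For illustration, the stacked-parameter renaming above maps separate checkpoint tensors onto a single fused vLLM parameter, with the shard id selecting which slice to fill (the weight name is made up):

stacked_params_mapping = [
    # (param_name, shard_name, shard_id)
    (".gate_up_proj", ".gate_proj", 0),
    (".gate_up_proj", ".up_proj", 1),
]
ckpt_name = "language_model.layers.0.mlp.up_proj.weight"   # hypothetical checkpoint name
for param_name, weight_name, shard_id in stacked_params_mapping:
    if weight_name in ckpt_name:
        fused_name = ckpt_name.replace(weight_name, param_name)
        # fused_name == "language_model.layers.0.mlp.gate_up_proj.weight", shard_id == 1
        break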

KimiVLImagePixelInputs

Bases: TensorSchema

Dimensions
  • nc: Number of channels
  • np: Number of patches
  • ps: Patch size
  • ni: Number of images
Source code in vllm/model_executor/models/kimi_vl.py
class KimiVLImagePixelInputs(TensorSchema):
    """
    Dimensions:
        - nc: Number of channels
        - np: Number of patches
        - ps: Patch size
        - ni: Number of images
    """

    type: Literal["pixel_values"] = "pixel_values"

    pixel_values: Annotated[
        Union[torch.Tensor, list[torch.Tensor]],
        TensorShape("np", 3, "ps", "ps"),
    ]

    image_grid_hws: Annotated[torch.Tensor, TensorShape("ni", 2)]

image_grid_hws instance-attribute

image_grid_hws: Annotated[Tensor, TensorShape(ni, 2)]

pixel_values instance-attribute

pixel_values: Annotated[
    Union[Tensor, list[Tensor]], TensorShape(np, 3, ps, ps)
]

type class-attribute instance-attribute

type: Literal['pixel_values'] = 'pixel_values'
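
Concretely, with an assumed patch size of 14, two images whose patch grids are 6x8 and 4x4 would satisfy this schema as:

import torch

image_grid_hws = torch.tensor([[6, 8], [4, 4]])        # (ni, 2) = (2, 2)
pixel_values = torch.randn(6 * 8 + 4 * 4, 3, 14, 14)   # (np, 3, ps, ps) = (64, 3, 14, 14)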

KimiVLMultiModalProcessor

Bases: BaseMultiModalProcessor[KimiVLProcessingInfo]

Source code in vllm/model_executor/models/kimi_vl.py
class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]):
    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        image_grid_hws = hf_inputs.get("image_grid_hws", torch.empty((0, 2)))
        image_grid_sizes = image_grid_hws.prod(-1)

        # pixel_values is merged into a single large tensor;
        # image_grid_hws gives the patch-grid shape of each subtensor in pixel_values
        return dict(
            pixel_values=MultiModalFieldConfig.flat_from_sizes(
                "image", image_grid_sizes
            ),
            image_grid_hws=MultiModalFieldConfig.batched("image"),
        )

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, Any],
        out_mm_kwargs: MultiModalKwargsItems,
    ) -> Sequence[PromptUpdate]:
        image_token_id = self.info.image_token_id

        def get_replacement(item_idx: int):
            images = mm_items.get_items(
                "image", (ImageEmbeddingItems, ImageProcessorItems)
            )

            if isinstance(images, ImageEmbeddingItems):
                num_image_tokens = images.get_feature_size(item_idx)
            else:
                image_size = images.get_image_size(item_idx)
                num_image_tokens = self.info.get_num_image_tokens(
                    image_width=image_size.width,
                    image_height=image_size.height,
                )

            return [image_token_id] * num_image_tokens

        return [
            PromptReplacement(
                modality="image",
                target=[image_token_id],
                replacement=get_replacement,
            ),
        ]

_get_mm_fields_config

_get_mm_fields_config(
    hf_inputs: BatchFeature,
    hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]
Source code in vllm/model_executor/models/kimi_vl.py
def _get_mm_fields_config(
    self,
    hf_inputs: BatchFeature,
    hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
    image_grid_hws = hf_inputs.get("image_grid_hws", torch.empty((0, 2)))
    image_grid_sizes = image_grid_hws.prod(-1)

    # pixel_values is merged into a single large tensor;
    # image_grid_hws gives the patch-grid shape of each subtensor in pixel_values
    return dict(
        pixel_values=MultiModalFieldConfig.flat_from_sizes(
            "image", image_grid_sizes
        ),
        image_grid_hws=MultiModalFieldConfig.batched("image"),
    )
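
A plain-tensor sketch (not vLLM API) of why the field is flattened this way: the per-image slice sizes are the products of each (h, w) patch grid, which is exactly what flat_from_sizes needs to recover per-image chunks from the merged tensor:

import torch

image_grid_hws = torch.tensor([[8, 8], [4, 6]])
image_grid_sizes = image_grid_hws.prod(-1)                          # tensor([64, 24])
pixel_values = torch.randn(int(image_grid_sizes.sum()), 3, 14, 14)  # assumed patch size 14
per_image = pixel_values.split(image_grid_sizes.tolist())
# per_image[0].shape[0] == 64, per_image[1].shape[0] == 24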

_get_prompt_updates

_get_prompt_updates(
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, Any],
    out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]
Source code in vllm/model_executor/models/kimi_vl.py
def _get_prompt_updates(
    self,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, Any],
    out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
    image_token_id = self.info.image_token_id

    def get_replacement(item_idx: int):
        images = mm_items.get_items(
            "image", (ImageEmbeddingItems, ImageProcessorItems)
        )

        if isinstance(images, ImageEmbeddingItems):
            num_image_tokens = images.get_feature_size(item_idx)
        else:
            image_size = images.get_image_size(item_idx)
            num_image_tokens = self.info.get_num_image_tokens(
                image_width=image_size.width,
                image_height=image_size.height,
            )

        return [image_token_id] * num_image_tokens

    return [
        PromptReplacement(
            modality="image",
            target=[image_token_id],
            replacement=get_replacement,
        ),
    ]
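
Conceptually, the replacement expands the single placeholder token id into one id per image token; the ids and count below are invented:

image_token_id = 999          # hypothetical id; the real one is media_placeholder_token_id
num_image_tokens = 1369
prompt_ids = [1, 5, image_token_id, 7]
expanded: list[int] = []
for tok in prompt_ids:
    expanded.extend([image_token_id] * num_image_tokens if tok == image_token_id else [tok])
# len(expanded) == 3 + num_image_tokens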

KimiVLMultiModalProjector

Bases: Module

Source code in vllm/model_executor/models/kimi_vl.py
class KimiVLMultiModalProjector(nn.Module):
    def __init__(
        self, config: KimiVLConfig, use_data_parallel: bool = False, prefix: str = ""
    ):
        super().__init__()
        self.use_data_parallel = use_data_parallel

        self.hidden_size = (
            config.vision_config.hidden_size
            * config.vision_config.merge_kernel_size[0]
            * config.vision_config.merge_kernel_size[1]
        )

        self.pre_norm = torch.nn.LayerNorm(config.vision_config.hidden_size, eps=1e-5)
        self.linear_1 = ReplicatedLinear(
            self.hidden_size,
            self.hidden_size,
            bias=True,
            prefix=maybe_prefix(prefix, "linear_1"),
        )
        self.linear_2 = ReplicatedLinear(
            self.hidden_size,
            config.text_config.hidden_size,
            bias=True,
            prefix=maybe_prefix(prefix, "linear_2"),
        )
        self.act = GELUActivation()

    def forward(self, image_features: torch.Tensor) -> torch.Tensor:
        hidden_states = self.pre_norm(image_features).view(-1, self.hidden_size)
        hidden_states, _ = self.linear_1(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states, _ = self.linear_2(hidden_states)
        return hidden_states

act instance-attribute

act = GELUActivation()

hidden_size instance-attribute

hidden_size = (
    hidden_size
    * merge_kernel_size[0]
    * merge_kernel_size[1]
)

linear_1 instance-attribute

linear_1 = ReplicatedLinear(
    hidden_size,
    hidden_size,
    bias=True,
    prefix=maybe_prefix(prefix, "linear_1"),
)

linear_2 instance-attribute

linear_2 = ReplicatedLinear(
    hidden_size,
    hidden_size,
    bias=True,
    prefix=maybe_prefix(prefix, "linear_2"),
)

pre_norm instance-attribute

pre_norm = LayerNorm(hidden_size, eps=1e-05)

use_data_parallel instance-attribute

use_data_parallel = use_data_parallel

__init__

__init__(
    config: KimiVLConfig,
    use_data_parallel: bool = False,
    prefix: str = "",
)
Source code in vllm/model_executor/models/kimi_vl.py
def __init__(
    self, config: KimiVLConfig, use_data_parallel: bool = False, prefix: str = ""
):
    super().__init__()
    self.use_data_parallel = use_data_parallel

    self.hidden_size = (
        config.vision_config.hidden_size
        * config.vision_config.merge_kernel_size[0]
        * config.vision_config.merge_kernel_size[1]
    )

    self.pre_norm = torch.nn.LayerNorm(config.vision_config.hidden_size, eps=1e-5)
    self.linear_1 = ReplicatedLinear(
        self.hidden_size,
        self.hidden_size,
        bias=True,
        prefix=maybe_prefix(prefix, "linear_1"),
    )
    self.linear_2 = ReplicatedLinear(
        self.hidden_size,
        config.text_config.hidden_size,
        bias=True,
        prefix=maybe_prefix(prefix, "linear_2"),
    )
    self.act = GELUActivation()

forward

forward(image_features: Tensor) -> Tensor
Source code in vllm/model_executor/models/kimi_vl.py
def forward(self, image_features: torch.Tensor) -> torch.Tensor:
    hidden_states = self.pre_norm(image_features).view(-1, self.hidden_size)
    hidden_states, _ = self.linear_1(hidden_states)
    hidden_states = self.act(hidden_states)
    hidden_states, _ = self.linear_2(hidden_states)
    return hidden_states
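
A plain-PyTorch sketch of the shape flow through the projector, with assumed sizes (vision hidden size 64, 2x2 merge kernel, text hidden size 128):

import torch
import torch.nn as nn
import torch.nn.functional as F

vision_hidden, k0, k1, text_hidden = 64, 2, 2, 128
merged = vision_hidden * k0 * k1                  # 256, plays the role of self.hidden_size

pre_norm = nn.LayerNorm(vision_hidden, eps=1e-5)
linear_1 = nn.Linear(merged, merged)
linear_2 = nn.Linear(merged, text_hidden)

x = torch.randn(16, vision_hidden)                # 16 patch features from the vision tower
h = pre_norm(x).view(-1, merged)                  # every k0*k1 patches fold into one token -> (4, 256)
h = linear_2(F.gelu(linear_1(h)))                 # project to the text hidden size -> (4, 128)
print(h.shape)                                    # torch.Size([4, 128])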

KimiVLProcessingInfo

Bases: BaseProcessingInfo

Source code in vllm/model_executor/models/kimi_vl.py
class KimiVLProcessingInfo(BaseProcessingInfo):
    def get_hf_config(self):
        return self.ctx.get_hf_config(KimiVLConfig)

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        hf_processor = self.get_hf_processor()
        patch_size = hf_processor.image_processor.patch_size
        kernel_size = hf_processor.image_processor.merge_kernel_size
        in_token_limit = hf_processor.image_processor.in_token_limit
        height = image_height
        width = image_width
        assert isinstance(height, int), f"height must be int, current height {height}"
        assert isinstance(width, int), f"width must be int, current width {width}"
        assert kernel_size is not None, "kernel_size must be specified"

        if (width // patch_size) * (height // patch_size) > in_token_limit:
            scale = math.sqrt(
                in_token_limit / ((width // patch_size) * (height // patch_size))
            )
            new_w, new_h = int(width * scale), int(height * scale)
            width, height = new_w, new_h

        kernel_height, kernel_width = kernel_size

        pad_height = (
            kernel_height * patch_size - height % (kernel_height * patch_size)
        ) % (kernel_height * patch_size)
        pad_width = (
            kernel_width * patch_size - width % (kernel_width * patch_size)
        ) % (kernel_width * patch_size)

        # Calculate new dimensions after padding and patching
        token_height = (height + pad_height) // (kernel_size[0] * patch_size)
        token_width = (width + pad_width) // (kernel_size[1] * patch_size)
        return int(token_height * token_width)

    @property
    def image_token_id(self) -> int:
        return self.get_hf_config().media_placeholder_token_id

image_token_id property

image_token_id: int

get_hf_config

get_hf_config()
Source code in vllm/model_executor/models/kimi_vl.py
def get_hf_config(self):
    return self.ctx.get_hf_config(KimiVLConfig)

get_num_image_tokens

get_num_image_tokens(
    *, image_width: int, image_height: int
) -> int
Source code in vllm/model_executor/models/kimi_vl.py
def get_num_image_tokens(
    self,
    *,
    image_width: int,
    image_height: int,
) -> int:
    hf_processor = self.get_hf_processor()
    patch_size = hf_processor.image_processor.patch_size
    kernel_size = hf_processor.image_processor.merge_kernel_size
    in_token_limit = hf_processor.image_processor.in_token_limit
    height = image_height
    width = image_width
    assert isinstance(height, int), f"height must be int, current height {height}"
    assert isinstance(width, int), f"width must be int, current width {width}"
    assert kernel_size is not None, "kernel_size must be specified"

    if (width // patch_size) * (height // patch_size) > in_token_limit:
        scale = math.sqrt(
            in_token_limit / ((width // patch_size) * (height // patch_size))
        )
        new_w, new_h = int(width * scale), int(height * scale)
        width, height = new_w, new_h

    kernel_height, kernel_width = kernel_size

    pad_height = (
        kernel_height * patch_size - height % (kernel_height * patch_size)
    ) % (kernel_height * patch_size)
    pad_width = (
        kernel_width * patch_size - width % (kernel_width * patch_size)
    ) % (kernel_width * patch_size)

    # Calculate new dimensions after padding and patching
    token_height = (height + pad_height) // (kernel_size[0] * patch_size)
    token_width = (width + pad_width) // (kernel_size[1] * patch_size)
    return int(token_height * token_width)
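
A worked example of the calculation, assuming patch_size=14, a 2x2 merge kernel, and an in_token_limit high enough that no downscaling is triggered (the real values come from the HF image processor config):

patch_size, kernel = 14, (2, 2)
width = height = 1024

def pad(dim: int, k: int) -> int:
    # Pad each side up to a multiple of kernel * patch_size.
    return (k * patch_size - dim % (k * patch_size)) % (k * patch_size)

token_height = (height + pad(height, kernel[0])) // (kernel[0] * patch_size)   # (1024 + 12) // 28 == 37
token_width = (width + pad(width, kernel[1])) // (kernel[1] * patch_size)      # 37
print(token_height * token_width)                                              # 1369 image tokens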

get_supported_mm_limits

get_supported_mm_limits() -> Mapping[str, Optional[int]]
Source code in vllm/model_executor/models/kimi_vl.py
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
    return {"image": None}

MaxImageTokenMeta dataclass

Source code in vllm/model_executor/models/kimi_vl.py
@dataclass
class MaxImageTokenMeta:
    width: int = 1024
    height: int = 1024

height class-attribute instance-attribute

height: int = 1024

width class-attribute instance-attribute

width: int = 1024

__init__

__init__(width: int = 1024, height: int = 1024) -> None

get_spec_layer_idx_from_weight_name

get_spec_layer_idx_from_weight_name(
    config: DeepseekV2Config, weight_name: str
) -> Optional[int]
Source code in vllm/model_executor/models/kimi_vl.py
def get_spec_layer_idx_from_weight_name(
    config: DeepseekV2Config, weight_name: str
) -> Optional[int]:
    if hasattr(config, "num_nextn_predict_layers") and (
        config.num_nextn_predict_layers > 0
    ):
        layer_idx = config.num_hidden_layers
        for i in range(config.num_nextn_predict_layers):
            if weight_name.startswith(f"model.layers.{layer_idx + i}."):
                return layer_idx + i
    return None
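
A usage sketch with a stand-in config (the layer counts are invented): only weight names belonging to the trailing next-token-prediction layers return an index, which is how load_weights above skips them.

from types import SimpleNamespace

cfg = SimpleNamespace(num_hidden_layers=27, num_nextn_predict_layers=1)
get_spec_layer_idx_from_weight_name(cfg, "model.layers.27.mlp.gate.weight")  # -> 27 (spec layer, skipped)
get_spec_layer_idx_from_weight_name(cfg, "model.layers.3.mlp.gate.weight")   # -> None (main model weight)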