# coding=utf-8
# Copyright 2023 Microsoft Research & University of Wisconsin-Madison and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Llava model configuration"""

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
from transformers.models.auto import CONFIG_MAPPING, AutoConfig


logger = logging.get_logger(__name__)


class LlavaConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`LlavaForConditionalGeneration`]. It is used to
    instantiate a Llava model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of Llava-9B.

    e.g. [llava-hf/llava-9b](https://huggingface.co/llava-hf/llava-9b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `CLIPVisionConfig`):
            The config object or dictionary of the vision backbone.
        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
            The config object or dictionary of the text backbone.
        image_token_index (`int`, *optional*, defaults to 32000):
            The image token index used to encode the image prompt.
        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The activation function used by the multimodal projector.
        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
            The feature selection strategy used to select the vision feature from the vision backbone.
            Can be one of `"default"` or `"full"`.
        vision_feature_layer (`Union[int, List[int]]`, *optional*, defaults to -2):
            The index of the layer to select the vision feature. If multiple indices are provided,
            the vision features of the corresponding indices are concatenated to form the final
            vision features.
        image_seq_length (`int`, *optional*, defaults to 576):
            Sequence length of one image embedding.
        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the multimodal projector.

    Example:

    ```python
    >>> from transformers import LlavaForConditionalGeneration, LlavaConfig, CLIPVisionConfig, LlamaConfig

    >>> # Initializing a CLIP-vision config
    >>> vision_config = CLIPVisionConfig()

    >>> # Initializing a Llama config
    >>> text_config = LlamaConfig()

    >>> # Initializing a Llava llava-1.5-7b style configuration
    >>> configuration = LlavaConfig(vision_config, text_config)

    >>> # Initializing a model from the llava-1.5-7b style configuration
    >>> model = LlavaForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
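
    # `sub_configs` maps attribute names to the classes used to resolve the nested
    # sub-model configs, and `is_composition` marks this config as assembled from
    # those sub-configs (assumed semantics, following recent transformers releases).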
    model_type = "llava"
    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
    is_composition = True

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        image_token_index=32000,
        projector_hidden_act="gelu",
        vision_feature_select_strategy="default",
        vision_feature_layer=-2,
        image_seq_length=576,
        multimodal_projector_bias=True,
        **kwargs,
    ):
        self.image_token_index = image_token_index
        self.projector_hidden_act = projector_hidden_act
        self.image_seq_length = image_seq_length
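
        # In transformers' Llava modeling code, "default" drops the CLS token from
        # the vision backbone's output and keeps only the patch embeddings, while
        # "full" keeps the whole sequence including CLS (noted here for reference;
        # the selection itself happens in the modeling file, not in this config).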
        if vision_feature_select_strategy not in ["default", "full"]:
            raise ValueError(
                "vision_feature_select_strategy should be one of 'default', 'full'. "
                f"Got: {vision_feature_select_strategy}"
            )

        self.vision_feature_select_strategy = vision_feature_select_strategy
        self.vision_feature_layer = vision_feature_layer

        if isinstance(vision_config, dict):
            vision_config["model_type"] = (
                vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model"
            )
            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
        elif vision_config is None:
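            # These defaults describe the CLIP ViT-L/14 tower at 336x336 resolution
            # used by llava-1.5; (336 / 14)^2 = 576 patches, matching the default
            # image_seq_length above.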
            vision_config = CONFIG_MAPPING["clip_vision_model"](
                intermediate_size=4096,
                hidden_size=1024,
                patch_size=14,
                image_size=336,
                num_hidden_layers=24,
                num_attention_heads=16,
                vocab_size=32000,
                projection_dim=768,
            )

        self.vision_config = vision_config

        if isinstance(text_config, dict):
            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
        elif text_config is None:
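            # Falls back to a stock LlamaConfig (llama-7b-sized defaults:
            # hidden_size=4096, 32 layers) when no text config is given.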
            text_config = CONFIG_MAPPING["llama"]()

        self.text_config = text_config
        self.multimodal_projector_bias = multimodal_projector_bias

        super().__init__(**kwargs)


__all__ = ["LlavaConfig"]
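

if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the upstream module): exercises the
    # dict-to-config promotion in __init__ above and a to_dict() round trip.
    # Assumes a transformers version that ships the CLIP vision and Llama configs
    # and that PretrainedConfig.to_dict() serializes nested configs to plain dicts.
    config = LlavaConfig(
        vision_config={"model_type": "clip_vision_model", "hidden_size": 1024, "patch_size": 14},
        text_config={"model_type": "llama", "hidden_size": 4096},
    )
    assert config.vision_config.model_type == "clip_vision_model"
    assert config.text_config.model_type == "llama"

    # The serialized sub-config dicts can be fed straight back into LlavaConfig,
    # which rebuilds equivalent sub-config objects via CONFIG_MAPPING.
    serialized = config.to_dict()
    rebuilt = LlavaConfig(
        vision_config=serialized["vision_config"],
        text_config=serialized["text_config"],
    )
    assert rebuilt.vision_config.hidden_size == config.vision_config.hidden_size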