
    cCi                     P    S SK Jr  SSKJr   " S S\5      r " S S\5      rSS/rg)	   )PretrainedConfig   )Qwen2Configc                      ^  \ rS rSrSrSr                 SS\S\S\S\S\S	\S
\S\S\S\S\4U 4S jjjr	Sr
U =r$ )Ovis2VisionConfig   a
  
This is the configuration class to store the configuration of a [`Ovis2VisionModel`]. It is used to instantiate a
Ovis2VisionModel model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of Ovis2.

Args:
    hidden_size (`int`, *optional*, defaults to 1024):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 2816):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 24):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        Number of channels in the input images.
    image_size (`int`, *optional*, defaults to 224):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 14):
        The size (resolution) of each patch.
    rms_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the RMSNorm layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    qkv_bias (`bool`, *optional*, defaults to `False`):
        Whether to add a learnable bias to the query, key, and value sequences at each attention head.
    mlp_bias (`bool`, *optional*, defaults to `False`):
        Whether to add a learnable bias to the MLP layers.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    vocab_size (`int`, *optional*, defaults to 16384):
        Vocabulary size of the Vision Transformer.
    hidden_stride (`int`, *optional*, defaults to 1):
        The stride of the hidden layer in the Vision Transformer.
    num_visual_indicator_tokens (`int`, *optional*, defaults to 5):
        Number of visual indicator tokens.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated normal initializer for initializing all weight matrices.
    tokenize_function (`str`, *optional*, defaults to `"softmax"`):
        The function used to tokenize the visual indicator tokens.
vision_confighidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_channels
image_size
patch_sizerms_norm_epsattention_dropoutqkv_biasmlp_biasc                    > [         TU ]  " S0 UD6  Xl        X l        X0l        X@l        XPl        Xpl        X`l        Xl	        Xl
        Xl        Xl        Xl        Xl        Xl        Xl        UU l        UU l        g )N )super__init__r
   r   r   r   r   r   r   r   
hidden_actr   r   r   
vocab_sizehidden_stridenum_visual_indicator_tokenstokenize_functioninitializer_range)selfr
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   kwargs	__class__s                      g/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/ovis2/configuration_ovis2.pyr   Ovis2VisionConfig.__init__B   sy    * 	"6"&!2!2#6 ($$!2$  ($*+F(!2!2    )r   r   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )i   i         r         gh㈵>g        FFsilui @        g{Gz?softmax)__name__
__module____qualname____firstlineno____doc__base_config_keyintfloatboolr   __static_attributes____classcell__r!   s   @r"   r   r      s    )V &O  !%!##$"#&$%#%(3(3 (3 	(3
 !(3 (3 (3 (3 (3 !(3 (3 (3 (3r$   r   c                   N   ^  \ rS rSrSrSr\\S.rSSS/ SQSS	4U 4S
 jjr	Sr
U =r$ )Ovis2Configm   a  
This is the configuration class to store the configuration of a [`Ovis2ForConditionalGeneration`]. It is used to instantiate a
Ovis2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of Ovis2.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

e.g. [thisisiron/Ovis2-1B-hf](https://huggingface.co/thisisiron/Ovis2-1B-hf)

Args:
    vision_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `Ovis2VisionConfig`):
        The config object or dictionary of the vision backbone.
    text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Qwen2Config`):
        The config object or dictionary of the text backbone.
    image_token_id (`int`, *optional*, defaults to 151665):
        The image token id to encode the image prompt.
    visual_indicator_token_ids (`List[int]`, *optional*, defaults to `[151666, 151667, 151668, 151669, 151670]`):
        The visual indicator token ids to encode the image prompt.
    vocab_size (`int`, *optional*, defaults to 151643):
        Vocabulary size of the text model.
    hidden_size (`int`, *optional*, defaults to 1536):
        Dimensionality of the encoder layers and the pooler layer.

```python
>>> from transformers import Ovis2ForConditionalGeneration, Ovis2Config

>>> # Initializing a Ovis2 style configuration
>>> configuration = Ovis2Config()

>>> # Initializing a model from the Ovis2-2B style configuration
>>> model = Ovis2ForConditionalGeneration(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```
ovis2)text_configr	   NiqP )irP isP itP iuP ivP i[P i   c                   > [        U[        5      (       a  [        S0 UD6U l        O[        U[        5      (       a  Xl        Uc  [        [	        U5      S9U l        [        U[        5      (       a  [        S0 UD6U l        O.[        U[
        5      (       a  X l        OUc  [        5       U l        XPl        X`l        X0l	        X@l
        [        TU ]0  " S0 UD6  g )N)r   r   )
isinstancedictr   r	   lenr   r=   r   r
   image_token_idvisual_indicator_token_idsr   r   )	r   r	   r=   rB   rC   r   r
   r    r!   s	           r"   r   Ovis2Config.__init__   s     mT**!2!C]!CD'899!. !2sSmOn!oDk4((*9[9D[11* *}D$&,*D'"6"r$   )r
   rB   r=   r	   rC   r   )r-   r.   r/   r0   r1   
model_typer   r   sub_configsr   r6   r7   r8   s   @r"   r:   r:   m   s9    $L J"-@QRK #K# #r$   r:   N)configuration_utilsr   qwen2.configuration_qwen2r   r   r:   __all__r   r$   r"   <module>rJ      s9     4 3V3( V3rF#" F#R 
.r$   