
    cCixP                         S r SSKJr  SSKJr  SSKJrJr  \R                  " \	5      r
 " S S\5      r " S S	\5      r " S
 S\5      r " S S\5      r " S S\5      r/ SQrg)zSAM2 model configuration   )PretrainedConfig)logging   )CONFIG_MAPPING
AutoConfigc                   ^   ^  \ rS rSrSrSrSr                   SU 4S jjrSrU =r	$ )Sam2HieraDetConfig   a\  
This is the configuration class to store the configuration of a [`Sam2HieraDetModel`]. It is used to instantiate
a HieraDet model as defined in the original sam2 repo according to the specified arguments, defining the model architecture.
Instantiating a configuration defaults will yield a similar configuration to that of SAM 2.1 Hiera-tiny
[facebook/sam2.1-hiera-tiny](https://huggingface.co/facebook/sam2.1-hiera-tiny) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 96):
        The hidden dimension of the image encoder.
    num_attention_heads (`int`, *optional*, defaults to 1):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        The number of channels in the image.
    image_size (`list[int]`, *optional*, defaults to `[1024, 1024]`):
        The size of the image.
    patch_kernel_size (`list[int]`, *optional*, defaults to `[7, 7]`):
        The kernel size of the patch.
    patch_stride (`list[int]`, *optional*, defaults to `[4, 4]`):
        The stride of the patch.
    patch_padding (`list[int]`, *optional*, defaults to `[3, 3]`):
        The padding of the patch.
    query_stride (`list[int]`, *optional*, defaults to `[2, 2]`):
        The downsample stride between stages.
    window_positional_embedding_background_size (`list[int]`, *optional*, defaults to `[7, 7]`):
        The window size per stage when not using global attention.
    num_query_pool_stages (`int`, *optional*, defaults to 3):
        The number of query pool stages.
    blocks_per_stage (`list[int]`, *optional*, defaults to `[1, 2, 7, 2]`):
        The number of blocks per stage.
    embed_dim_per_stage (`list[int]`, *optional*, defaults to `[96, 192, 384, 768]`):
        The embedding dimension per stage.
    num_attention_heads_per_stage (`list[int]`, *optional*, defaults to `[1, 2, 4, 8]`):
        The number of attention heads per stage.
    window_size_per_stage (`list[int]`, *optional*, defaults to `[8, 4, 14, 7]`):
        The window size per stage.
    global_attention_blocks (`list[int]`, *optional*, defaults to `[5, 7, 9]`):
        The blocks where global attention is used.
    mlp_ratio (`float`, *optional*, defaults to 4.0):
        The ratio of the MLP hidden dimension to the embedding dimension.
    hidden_act (`str`, *optional*, defaults to `"gelu"`):
        The non-linear activation function in the neck.
    layer_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon for the layer normalization.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

backbone_configsam2_hiera_det_modelc                   > [         TU ]  " S0 UD6  Ub  UOSS/nUb  UOSS/nUb  UOSS/nUb  UOSS/nUb  UOSS/nU	b  U	OSS/n	Ub  UO/ SQnUb  UO/ SQnUb  UO/ SQnUb  UO/ S	QnUb  UO/ S
QnXl        X l        X0l        X@l        XPl        X`l        Xpl        Xl	        Xl
        Xl        Xl        Xl        Xl        Xl        Xl        UU l        UU l        UU l        UU l        g )N         r   r   )   r   r   r   )`           )r   r   r      )r   r      r   )   r   	    )super__init__hidden_sizenum_attention_headsnum_channels
image_sizepatch_kernel_sizepatch_stridepatch_paddingquery_stride+window_positional_embedding_background_sizenum_query_pool_stagesblocks_per_stageembed_dim_per_stagenum_attention_heads_per_stagewindow_size_per_stageglobal_attention_blocks	mlp_ratio
hidden_actlayer_norm_epsinitializer_range)selfr   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   kwargs	__class__s                        e/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/sam2/configuration_sam2.pyr   Sam2HieraDetConfig.__init__P   sM   . 	"6"#-#9Zd|
1B1N-UVXYTZ'3'?|aV)6)BA'3'?|aV ;F 8Q 	4
 0@/K+Q]5H5T1Zm-J-V)\h 	& :O9Z 5`m=T=`"9fo&#6 ($!2(*(;f8%:" 0#6 -J*%:"'>$"$,!2    )r'   r(   r+   r-   r   r    r/   r.   r,   r   r)   r   r&   r!   r#   r"   r$   r%   r*   )r   r   r   NNNNNNr   NNNNNg      @geluư>{Gz?)
__name__
__module____qualname____firstlineno____doc__base_config_key
model_typer   __static_attributes____classcell__r2   s   @r3   r	   r	      sZ    1f (O'J 48 &*" $)=3 =3r5   r	   c                   X   ^  \ rS rSrSrSrSrS\0r            SU 4S jjr	Sr
U =r$ )	Sam2VisionConfig   an  
This is the configuration class to store the configuration of a [`Sam2VisionModel`]. It is used to instantiate a SAM
vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
defaults will yield a similar configuration to that of SAM 2.1 Hiera-tiny
[facebook/sam2.1-hiera-tiny](https://huggingface.co/facebook/sam2.1-hiera-tiny) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    backbone_config (`Union[dict, "PretrainedConfig"]`, *optional*):
        Configuration for the vision backbone. This is used to instantiate the backbone using
        `AutoModel.from_config`.
    backbone_channel_list (`List[int]`, *optional*, defaults to `[768, 384, 192, 96]`):
        The list of channel dimensions for the backbone.
    backbone_feature_sizes (`List[List[int]]`, *optional*, defaults to `[[256, 256], [128, 128], [64, 64]]`):
        The spatial sizes of the feature maps from the backbone.
    fpn_hidden_size (`int`, *optional*, defaults to 256):
        The hidden dimension of the FPN.
    fpn_kernel_size (`int`, *optional*, defaults to 1):
        The kernel size for the convolutions in the neck.
    fpn_stride (`int`, *optional*, defaults to 1):
        The stride for the convolutions in the neck.
    fpn_padding (`int`, *optional*, defaults to 0):
        The padding for the convolutions in the neck.
    fpn_top_down_levels (`List[int]`, *optional*, defaults to `[2, 3]`):
        The levels for the top-down FPN connections.
    num_feature_levels (`int`, *optional*, defaults to 3):
        The number of feature levels from the FPN to use.
    hidden_act (`str`, *optional*, defaults to `"gelu"`):
        The non-linear activation function in the neck.
    layer_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon for the layer normalization.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

vision_configsam2_vision_modelr   c                   > [         TU ]  " S	0 UD6  Uc  / SQOUnUc  SS/SS/SS//OUnUc  SS/OUn[        U[        5      (       a(  UR	                  SS5      US'   [
        US      " S	0 UD6nO#[        U[        5      (       a  OUc
  [        5       nXl        X l        X0l	        X@l
        XPl        X`l        Xpl        Xl        Xl        Xl        Xl        Xl        g )
N)r   r   r   r         @   r   r   r?   r   r   )r   r   
isinstancedictgetr   r	   r   backbone_channel_listbackbone_feature_sizesfpn_hidden_sizefpn_kernel_size
fpn_stridefpn_paddingfpn_top_down_levelsnum_feature_levelsr-   r.   r/   )r0   r   rO   rP   rQ   rR   rS   rT   rU   rV   r-   r.   r/   r1   r2   s                 r3   r   Sam2VisionConfig.__init__   s      	"6"7L7T 3Zo2H2Pc3Z#sb"X.Vl 	 )<(Cq!fI\ot,,,;,?,?Nd,eOL),_\-JK^o^O);<<$02O. &;"&<#..$&#6 "4$,!2r5   )rO   r   rP   rQ   rR   rT   rS   rU   r-   r/   r.   rV   )NNNrI   r   r       Nr   r6   r7   r8   )r9   r:   r;   r<   r=   r>   r?   r   sub_configsr   r@   rA   rB   s   @r3   rD   rD      sQ    $L &O$J:K "# .3 .3r5   rD   c                   D   ^  \ rS rSrSrSr        SU 4S jjrSrU =r$ )Sam2PromptEncoderConfig   a  
This is the configuration class to store the configuration of a [`Sam2PromptEncoder`]. The [`Sam2PromptEncoder`]
module is used to encode the input 2D points and bounding boxes.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 256):
        Dimensionality of the hidden states.
    image_size (`int`, *optional*, defaults to 1024):
        The expected output resolution of the image.
    patch_size (`int`, *optional*, defaults to 16):
        The size (resolution) of each patch.
    mask_input_channels (`int`, *optional*, defaults to 16):
        The number of channels to be fed to the `MaskDecoder` module.
    num_point_embeddings (`int`, *optional*, defaults to 4):
        The number of point embeddings to be used.
    hidden_act (`str`, *optional*, defaults to `"gelu"`):
        The non-linear activation function in the encoder and pooler.
    layer_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon used by the layer normalization layers.
    scale (`float`, *optional*, defaults to 1):
        The scale factor for the prompt encoder.
prompt_encoder_configc	                    > [         T
U ]  " S0 U	D6  Xl        X l        X0l        X@l        XPl        X`l        Xpl        Xl	        g Nr   )
r   r   r   r    
patch_sizemask_input_channelsnum_point_embeddingsr-   r.   scale)r0   r   r    r`   ra   rb   r-   r.   rc   r1   r2   s             r3   r    Sam2PromptEncoderConfig.__init__  sA     	"6"&$$#6 $8!$,
r5   )r-   r   r    r.   ra   rb   r`   rc   )rI   r      re   r   r6   r7   r   	r9   r:   r;   r<   r=   r>   r   r@   rA   rB   s   @r3   r[   r[      s3    4 .O  r5   r[   c                   L   ^  \ rS rSrSrSr            SU 4S jjrSrU =r$ )Sam2MaskDecoderConfigi"  a	  
This is the configuration class to store the configuration of a [`Sam2MaskDecoder`]. It is used to instantiate a SAM2
memory encoder according to the specified arguments, defining the model architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 256):
        Dimensionality of the hidden states.
    hidden_act (`str`, *optional*, defaults to `"gelu"`):
        The non-linear activation function in the SAM2 mask decoder.
    mlp_dim (`int`, *optional*, defaults to 2048):
        The dimension of the MLP in the two-way transformer.
    num_hidden_layers (`int`, *optional*, defaults to 2):
        The number of hidden layers in the two-way transformer.
    num_attention_heads (`int`, *optional*, defaults to 8):
        The number of attention heads in the two-way transformer.
    attention_downsample_rate (`int`, *optional*, defaults to 2):
        The downsample rate for the attention layers.
    num_multimask_outputs (`int`, *optional*, defaults to 3):
        The number of multimask outputs.
    iou_head_depth (`int`, *optional*, defaults to 3):
        The depth of the IoU head.
    iou_head_hidden_dim (`int`, *optional*, defaults to 256):
        The hidden dimension of the IoU head.
    dynamic_multimask_via_stability (`bool`, *optional*, defaults to `True`):
        Whether to use dynamic multimask via stability.
    dynamic_multimask_stability_delta (`float`, *optional*, defaults to 0.05):
        The stability delta for the dynamic multimask.
    dynamic_multimask_stability_thresh (`float`, *optional*, defaults to 0.98):
        The stability threshold for the dynamic multimask.

mask_decoder_configc                    > [         TU ]  " S0 UD6  Xl        Xpl        X l        Xl        Xl        Xl        Xl        Xl	        X@l
        Xl        XPl        X0l        X`l        g r_   )r   r   r   num_multimask_outputsr-   iou_head_depthiou_head_hidden_dimdynamic_multimask_via_stability!dynamic_multimask_stability_delta"dynamic_multimask_stability_threshnum_hidden_layersr   mlp_dimattention_downsample_rate)r0   r   r-   rr   rq   r   rs   rk   rl   rm   rn   ro   rp   r1   r2   s                 r3   r   Sam2MaskDecoderConfig.__init__H  sc      	"6"&%:"$,#6 /N,1R.2T/ "3&#6 )B&r5   )rs   ro   rp   rn   r-   r   rl   rm   rr   r   rq   rk   )rI   r6   i   r   r   r   r   r   rI   Tg?g\(\?rf   rB   s   @r3   rh   rh   "  sB    !F ,O "#(,*.+/ C  Cr5   rh   c                   H   ^  \ rS rSrSrSr\\\S.r	    SU 4S jjr
SrU =r$ )
Sam2Configik  a  
[`Sam2Config`] is the configuration class to store the configuration of a [`Sam2Model`]. It is used to instantiate a
SAM2 model according to the specified arguments, defining the memory attention, memory encoder, and image encoder
configs. Instantiating a configuration defaults will yield a similar configuration to that of the SAM 2.1 Hiera-tiny
[facebook/sam2.1-hiera-tiny](https://huggingface.co/facebook/sam2.1-hiera-tiny) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    vision_config (Union[`dict`, `Sam2VisionConfig`], *optional*):
        Dictionary of configuration options used to initialize [`Sam2VisionConfig`].
    prompt_encoder_config (Union[`dict`, `Sam2PromptEncoderConfig`], *optional*):
        Dictionary of configuration options used to initialize [`Sam2PromptEncoderConfig`].
    mask_decoder_config (Union[`dict`, `Sam2MaskDecoderConfig`], *optional*):
        Dictionary of configuration options used to initialize [`Sam2MaskDecoderConfig`].
    initializer_range (`float`, *optional*, defaults to 0.02):
        Standard deviation for parameter initialization.

Example:

```python
>>> from transformers import (
...     Sam2VisionConfig,
...     Sam2PromptEncoderConfig,
...     Sam2MaskDecoderConfig,
...     Sam2Model,
... )

>>> # Initializing a Sam2Config with `"facebook/sam2.1_hiera_tiny"` style configuration
>>> configuration = Sam2config()

>>> # Initializing a Sam2Model (with random weights) from the `"facebook/sam2.1_hiera_tiny"` style configuration
>>> model = Sam2Model(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config

>>> # We can also initialize a Sam2Config from a Sam2VisionConfig, Sam2PromptEncoderConfig, and Sam2MaskDecoderConfig

>>> # Initializing SAM2 vision encoder, memory attention, and memory encoder configurations
>>> vision_config = Sam2VisionConfig()
>>> prompt_encoder_config = Sam2PromptEncoderConfig()
>>> mask_decoder_config = Sam2MaskDecoderConfig()

>>> config = Sam2Config(vision_config, prompt_encoder_config, mask_decoder_config)
```sam2)rF   r]   ri   c                   > [         TU ]  " S0 UD6  Ub  UO0 nUb  UO0 nUb  UO0 n[        U[        5      (       a'  UR	                  SS5      US'   [
        US      " S0 UD6n[        U[        5      (       a  UR                  5       n[        U[        5      (       a  UR                  5       nXl	        [        S0 UD6U l
        [        S0 UD6U l        X@l        g )Nr?   rG   r   )r   r   rL   rM   rN   r   r[   to_dictrh   rF   r]   ri   r/   )r0   rF   r]   ri   r/   r1   r2   s         r3   r   Sam2Config.__init__  s     	"6")6)B9N9Z 5`b5H5T1Z\mT***7*;*;LJ]*^M,'*=+FGX-XM+-DEE$9$A$A$C!)+@AA"5"="="?*%<%U?T%U"#8#O;N#O !2r5   )r/   ri   r]   rF   )NNNr8   )r9   r:   r;   r<   r=   r?   r   r[   rh   rY   r   r@   rA   rB   s   @r3   rv   rv   k  s8    .` J#!84K " 3 3r5   rv   )rv   r	   rD   r[   rh   N)r=   configuration_utilsr   utilsr   autor   r   
get_loggerr9   loggerr	   rD   r[   rh   rv   __all__r   r5   r3   <module>r      s|     3  - 
		H	%t3) t3n[3' [3|1. 1hFC, FCRQ3! Q3hr5   