
    cCi%                        S r SSKJrJr  SSKrSSKJr  SSKrSSKJ	r	J
r
Jr  SSKJrJrJrJrJrJrJrJr  SSKJrJr  SSKJr  SS	KJr  SS
KJr  SSKJrJ r    SSK!J"r"   " S S\5      r# " S S\5      r$ " S S\
5      r% " S S\	5      r& " S S\5      r' " S S\5      r( " S S\5      r) " S S\5      r* " S S\5      r+\ " S  S!\5      5       r,\" S"S#9 " S$ S%\5      5       r- " S& S'\5      r./ S(Qr/g))zPyTorch SAM 2 model.    )OptionalUnionN)
Sam2ConfigSam2MaskDecoderConfigSam2PromptEncoderConfig)Sam2AttentionSam2FeedForwardSam2LayerNorm	Sam2ModelSam2PreTrainedModelSam2TwoWayAttentionBlockSam2VisionEncoderOutputSam2VisionModel)TransformersKwargscheck_model_inputs   )PretrainedConfig)Unpack)auto_docstring   )CONFIG_MAPPING
AutoConfig)TimmWrapperModelc                   X   ^  \ rS rSrSrSrSrS\0r            SU 4S jjr	Sr
U =r$ )	EdgeTamVisionConfig1   a\  
This is the configuration class to store the configuration of a [`EdgeTamVisionModel`]. It is used to instantiate a SAM
vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
defaults will yield a similar configuration to that of SAM 2.1 Hiera-tiny
[facebook/EdgeTAM](https://huggingface.co/facebook/EdgeTAM) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    backbone_config (`Union[dict, "PretrainedConfig"]`, *optional*):
        Configuration for the vision backbone. This is used to instantiate the backbone using
        `AutoModel.from_config`.
    backbone_channel_list (`List[int]`, *optional*, defaults to `[384, 192, 96, 48]`):
        The list of channel dimensions for the backbone.
    backbone_feature_sizes (`List[List[int]]`, *optional*, defaults to `[[256, 256], [128, 128], [64, 64]]`):
        The spatial sizes of the feature maps from the backbone.
    fpn_hidden_size (`int`, *optional*, defaults to 256):
        The hidden dimension of the FPN.
    fpn_kernel_size (`int`, *optional*, defaults to 1):
        The kernel size for the convolutions in the neck.
    fpn_stride (`int`, *optional*, defaults to 1):
        The stride for the convolutions in the neck.
    fpn_padding (`int`, *optional*, defaults to 0):
        The padding for the convolutions in the neck.
    fpn_top_down_levels (`List[int]`, *optional*, defaults to `[2, 3]`):
        The levels for the top-down FPN connections.
    num_feature_levels (`int`, *optional*, defaults to 3):
        The number of feature levels from the FPN to use.
    hidden_act (`str`, *optional*, defaults to `"gelu"`):
        The non-linear activation function in the neck.
    layer_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon for the layer normalization.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

vision_configedgetam_vision_modelbackbone_configc                   > [         TU ]  " S0 UD6  Uc  / SQOUnUc  SS/SS/SS//OUnUc  SS/OUn[        U[        5      (       a(  UR	                  SS5      US'   [
        US      " S0 UD6nO6[        U[        5      (       a  UnOUc  [        R                  " S	SS
/ SQS.S9nXl        X l	        X0l
        X@l        XPl        X`l        Xpl        Xl        Xl        Xl        Xl        Xl        g )N)i     `   0         @   r   r   
model_typetimm_wrapperztimm/repvit_m1.dist_in1kT)r      r   r   )in_chansfeatures_onlyout_indices)
model_args )super__init__
isinstancedictgetr   r   from_pretrainedr   backbone_channel_listbackbone_feature_sizesfpn_hidden_sizefpn_kernel_size
fpn_stridefpn_paddingfpn_top_down_levelsnum_feature_levels
hidden_actlayer_norm_epsinitializer_range)selfr   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   kwargs	__class__s                 e/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/edgetam/modular_edgetam.pyr0   EdgeTamVisionConfig.__init__^   s     	"6"6K6S 2Yn2H2Pc3Z#sb"X.Vl 	 )<(Cq!fI\ot,,,;,?,?n,]OL),_\-JK^o^O44-O$(88*()DQ]^O
  / &;"&<#..$&#6 "4$,!2    )r5   r   r6   r7   r8   r:   r9   r;   r=   r?   r>   r<   )NNNr$   r)   r)   r   Nr   gelugư>g{Gz?)__name__
__module____qualname____firstlineno____doc__base_config_keyr'   r   sub_configsr0   __static_attributes____classcell__)rB   s   @rC   r   r   1   sQ    $L &O'J:K "# 13 13rE   r   c                       \ rS rSrSrg)EdgeTamPromptEncoderConfig   r.   NrG   rH   rI   rJ   rN   r.   rE   rC   rQ   rQ          rE   rQ   c                       \ rS rSrSrg)EdgeTamMaskDecoderConfig   r.   NrS   r.   rE   rC   rV   rV      rT   rE   rV   c                       \ rS rSrSrg)EdgeTamConfig   r.   NrS   r.   rE   rC   rY   rY      rT   rE   rY   c                       \ rS rSrSrg)EdgeTamLayerNorm   r.   NrS   r.   rE   rC   r\   r\      rT   rE   r\   c                       \ rS rSrSrg)EdgeTamVisionEncoderOutput   r.   NrS   r.   rE   rC   r_   r_      rT   rE   r_   c                       \ rS rSrSrg)EdgeTamAttention   r.   NrS   r.   rE   rC   rb   rb      rT   rE   rb   c                       \ rS rSrSrg)EdgeTamTwoWayAttentionBlock   r.   NrS   r.   rE   rC   re   re      rT   rE   re   c                       \ rS rSrSrg)EdgeTamFeedForward   r.   NrS   r.   rE   rC   rh   rh      rT   rE   rh   c                       \ rS rSrS rSrg)EdgeTamPreTrainedModel   c                    U R                   R                  n[        U[        R                  [        R
                  [        R                  45      (       aV  UR                  R                  R                  SUS9  UR                  b$  UR                  R                  R                  5         O[        U[        R                  5      (       ac  UR                  R                  R                  SUS9  UR                  b1  UR                  R                  UR                     R                  5         On[        U[        R                  [        45      (       aI  UR                  R                  R!                  S5        UR                  R                  R                  5         [        U["        5      (       a3  UR$                  b%  UR$                  R                  R                  5         g g g )Ng        )meanstdg      ?)configr?   r1   nnLinearConv2dConvTranspose2dweightdatanormal_biaszero_	Embeddingpadding_idx	LayerNormr\   fill_EdgeTamModelno_memory_embedding)r@   modulero   s      rC   _init_weights$EdgeTamPreTrainedModel._init_weights   sM   kk++fryy"))R5G5GHIIMM&&CS&9{{&  &&(--MM&&CS&9!!-""6#5#56<<>/? @AAMM$$S)KK""$fl++))5**//557 6 ,rE   r.   N)rG   rH   rI   rJ   r   rN   r.   rE   rC   rk   rk      s    8rE   rk   zN
    The vision model from EdgeTAM without any head or projection on top.
    )custom_introc            
           \ rS rSr\rSr\\S.rS r	\
" 5        S
S\\R                     S\\   S\\\4   4S jj5       rS	rg)EdgeTamVisionModel   pixel_values)hidden_states
attentionsc                     [        S5      eNz2Can't get input embeddings from timm wrapper modelNotImplementedErrorr@   s    rC   get_input_embeddings'EdgeTamVisionModel.get_input_embeddings       !"VWWrE   NrA   returnc           	      >   Uc  [        S5      eU R                  U5      nUR                  nU Vs/ s H  oUR                  SSSS5      PM     nnU R	                  U5      u  pgX`R
                  * S  S S S2   nXpR
                  * S  S S S2   n[        US   UUS9$ s  snf )Nz You have to specify pixel_valuesr   r   r   r)   )last_hidden_statefpn_hidden_statesfpn_position_encoding)
ValueErrorbackboner   permuteneckr<   r_   )r@   r   rA   backbone_outputintermediate_hidden_stateshidden_stater   r   s           rC   forwardEdgeTamVisionModel.forward   s     ?@@ --5%4%F%F"[u%v[u<&:&:1aA&F[u"%v3799=W3X0-/F/F.F.HI$B$O 57N7N6N6P QRVTVRV W)8</"7
 	
 &ws   Br.   )N)rG   rH   rI   rJ   r   config_classmain_input_namer   _can_record_outputsr   r   r   torchFloatTensorr   r   r   tupler_   r   rN   r.   rE   rC   r   r      st     'L$O,<L\]X  59
u001
 +,
 
u00	1	
 
rE   r   c                   "    \ rS rSr/ SQrS rSrg)r~      )z
^memory_.*z^mask_downsample.*zspatial_perceiver.*z^object_pointer_proj.*z0^temporal_positional_encoding_projection_layer.*no_memory_positional_encodingno_object_pointer%occlusion_spatial_embedding_parameterc                     [        S5      er   r   r   s    rC   r   !EdgeTamModel.get_input_embeddings   r   rE   r.   N)rG   rH   rI   rJ   "_keys_to_ignore_on_load_unexpectedr   rN   r.   rE   rC   r~   r~      s    	*&XrE   r~   )r~   r   rk   rY   r   rQ   rV   )0rK   typingr   r   r   torch.nnrq   torch.utils.checkpoint+transformers.models.sam2.configuration_sam2r   r   r   &transformers.models.sam2.modeling_sam2r   r	   r
   r   r   r   r   r   transformers.utils.genericr   r   configuration_utilsr   processing_utilsr   utilsr   autor   r   6transformers.models.timm_wrapper.modeling_timm_wrapperr   r   rQ   rV   rY   r\   r_   rb   re   rh   rk   r   r~   __all__r.   rE   rC   <module>r      s    "    r r	 	 	 N 3 & . W^3* ^3B	!8 		4 		J 		} 		!8 		} 		": 		 	 80 8 8& 

 


DX9 X rE   