
    cCi                         S SK JrJr  S SKrS SKJr  S SKJr  SSKJ	r	J
r
  SSKJr  SSKJrJrJr  SS	KJrJrJrJr   " S
 S\5      r\ " S S\5      5       r " S S\\5      r\" SS9 " S S\\5      5       r/ SQrg)    )OptionalUnionN)IJepaConfig   )BaseModelOutputWithPoolingImageClassifierOutput)Unpack)TransformersKwargsauto_docstring	torch_int   )ViTEmbeddingsViTForImageClassificationViTModelViTPreTrainedModelc            	          ^  \ rS rSrSS\S\SS4U 4S jjjrS\R                  S\	S	\	S\R                  4S
 jr
  SS\R                  S\\R                     S\S\R                  4S jjrSrU =r$ )IJepaEmbeddings   configuse_mask_tokenreturnNc                    > [         TU ]  X5        U ?U R                  R                  n[
        R                  " [        R                  " SX1R                  5      5      U l
        g )N   )super__init__	cls_tokenpatch_embeddingsnum_patchesnn	Parametertorchrandnhidden_sizeposition_embeddings)selfr   r   r   	__class__s       a/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/ijepa/modular_ijepa.pyr   IJepaEmbeddings.__init__   sH    0N++77#%<<A{L^L^0_#`     
embeddingsheightwidthc                 ,   UR                   S   nU R                  R                   S   n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  nUR                   S   nX R
                  -  nX0R
                  -  n	[        US-  5      n
UR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SSS	9nUR                  SSSS5      R                  SSU5      nU$ )
a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   g      ?r   r   r   bicubicF)sizemodealign_corners)shaper$   r!   jit
is_tracing
patch_sizer   reshapepermuter   
functionalinterpolateview)r%   r*   r+   r,   r   num_positionspatch_pos_embeddim
new_height	new_widthsqrt_num_positionss              r'   interpolate_pos_encoding(IJepaEmbeddings.interpolate_pos_encoding   s    !&&q)0066q9 yy##%%+*F6?+++22r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nr)   pixel_valuesbool_masked_posrB   c                 n   UR                   u  pEpgU R                  XS9nUbX  UR                   S   n	U R                  R                  XIS5      n
UR	                  S5      R                  U
5      nUSU-
  -  X-  -   nU(       a  XR                  XU5      -   nOXR                  -   nU R                  U5      nU$ )N)rB   r   r.         ?)	r3   r   
mask_tokenexpand	unsqueezetype_asrB   r$   dropout)r%   rD   rE   rB   
batch_size_r+   r,   r*   
seq_lengthmask_tokensmasks               r'   forwardIJepaEmbeddings.forward=   s     (4'9'9$
v**<*k
&#))!,J//00LK",,R088ED#sTz2[5GGJ $#&C&CJX]&^^J#&>&>>J\\*-
r)   )r$   )F)NF)__name__
__module____qualname____firstlineno__r   boolr   r!   TensorintrB   r   
BoolTensorrR   __static_attributes____classcell__r&   s   @r'   r   r      s    a{ aD aT a a%5<< % %UX %]b]i]i %T 7;).	ll "%"2"23 #'	
 
 r)   r   c                   n    \ rS rSrS\\R                  \R                  \R                  4   SS4S jr	Sr
g)IJepaPreTrainedModelX   moduler   Nc                    [        U[        R                  [        R                  45      (       a  [        R                  R                  UR                  R                  R                  [        R                  5      SU R                  R                  S9R                  UR                  R                  5      UR                  l        UR                  b%  UR                  R                  R                  5         gg[        U[        R                   5      (       aJ  UR                  R                  R                  5         UR                  R                  R#                  S5        g[        U[$        5      (       a  [        R                  R                  UR&                  R                  R                  [        R                  5      SU R                  R                  S9R                  UR&                  R                  5      UR&                  l        UR(                  b%  UR(                  R                  R                  5         ggg)zInitialize the weightsg        )meanstdNrG   )
isinstancer   LinearConv2dinittrunc_normal_weightdatator!   float32r   initializer_rangedtypebiaszero_	LayerNormfill_r   r$   rH   )r%   rb   s     r'   _init_weights"IJepaPreTrainedModel._init_weightsZ   s   fryy"))455 "$!6!6""%%emm43DKKDaDa "7 "b$$% MM {{&  &&( '--KK""$MM$$S)00.0gg.C.C**//225==AKK11 /D / b++112	 &&+
   ,!!&&,,. - 1r)    )rT   rU   rV   rW   r   r   rg   rh   rs   ru   r\   rw   r)   r'   r`   r`   X   s.    /E"))RYY*L$M /RV /r)   r`   c                   <   ^  \ rS rSrSS\S\S\4U 4S jjjrSrU =r$ )


class IJepaModel(IJepaPreTrainedModel, ViTModel):
    def __init__(self, config: IJepaConfig, add_pooling_layer: bool = False, use_mask_token: bool = False):
        r"""
        add_pooling_layer (`bool`, *optional*, defaults to `False`):
            Whether to add a pooling layer.
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        """
        super().__init__(config)
        self.config = config
        self.embeddings = IJepaEmbeddings(config, use_mask_token=use_mask_token)


@auto_docstring(
    custom_intro="""
    IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states)
    e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
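
    Example (a minimal inference sketch; the checkpoint name, label count, and input resolution below are illustrative
    assumptions, not values prescribed by this module):

    ```python
    >>> import torch
    >>> from transformers import IJepaForImageClassification

    >>> # The classification head is newly initialized when loading a backbone-only checkpoint.
    >>> model = IJepaForImageClassification.from_pretrained("facebook/ijepa_vith14_1k", num_labels=2)

    >>> # An input larger than the pretraining resolution; position embeddings are interpolated on the fly.
    >>> pixel_values = torch.randn(1, 3, 448, 448)
    >>> with torch.no_grad():
    ...     logits = model(pixel_values, interpolate_pos_encoding=True).logits
    >>> logits.shape
    torch.Size([1, 2])
    ```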
    """
)
class IJepaForImageClassification(IJepaPreTrainedModel, ViTForImageClassification):
    def __init__(self, config: IJepaConfig):
        super().__init__(config)
        self.ijepa = IJepaModel(config, add_pooling_layer=False)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> ImageClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs: BaseModelOutputWithPooling = self.ijepa(
            pixel_values,
            head_mask=head_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )

        sequence_output = outputs.last_hidden_state

        # I-JEPA has no [CLS] token, so average over the patch tokens before the classifier.
        logits = self.classifier(sequence_output.mean(dim=1))

        loss = None
        if labels is not None:
            loss = self.loss_function(labels, logits, self.config, **kwargs)

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "IJepaPreTrainedModel",
    "IJepaModel",
    "IJepaForImageClassification",
]